diff --git a/litellm/__init__.py b/litellm/__init__.py
index db41a0fc5..b84ad602e 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -375,6 +375,7 @@ from .integrations import *
 from .exceptions import (
     AuthenticationError,
     InvalidRequestError,
+    BadRequestError,
     RateLimitError,
     ServiceUnavailableError,
     OpenAIError,
diff --git a/litellm/exceptions.py b/litellm/exceptions.py
index 9567fe799..03240fc6d 100644
--- a/litellm/exceptions.py
+++ b/litellm/exceptions.py
@@ -8,75 +8,82 @@
 # Thank you users! We ❤️ you! - Krrish & Ishaan
 
 ## LiteLLM versions of the OpenAI Exception Types
-from openai.error import (
+
+from openai import (
     AuthenticationError,
-    InvalidRequestError,
+    BadRequestError,
     RateLimitError,
-    ServiceUnavailableError,
+    APIStatusError,
     OpenAIError,
     APIError,
-    Timeout,
+    APITimeoutError,
     APIConnectionError,
 )
-
+import httpx
 
 class AuthenticationError(AuthenticationError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 401
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
-
-class InvalidRequestError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class BadRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
         self.status_code = 400
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
-class Timeout(Timeout):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class Timeout(APITimeoutError):  # type: ignore
+    def __init__(self, message, model, llm_provider, request: httpx.Request):
         self.status_code = 408
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            request=request
         )  # Call the base class constructor with the parameters it needs
 
-# sub class of invalid request error - meant to give more granularity for error handling context window exceeded errors
-class ContextWindowExceededError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
-        self.status_code = 400
-        self.message = message
-        self.model = model
-        self.llm_provider = llm_provider
-        super().__init__(
-            self.message, self.model, self.llm_provider
-        )  # Call the base class constructor with the parameters it needs
-
-
 class RateLimitError(RateLimitError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 429
         self.message = message
         self.llm_provider = llm_provider
         self.modle = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
+# sub class of bad request error - meant to give more granularity for error handling context window exceeded errors
+class ContextWindowExceededError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            message=self.message,
+            model=self.model,
+            llm_provider=self.llm_provider,
+            response=response
+        )  # Call the base class constructor with the parameters it needs
 
-class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore
+class ServiceUnavailableError(APIStatusError):  # type: ignore
     def __init__(self, message, llm_provider, model):
-        self.status_code = 500
+        self.status_code = 503
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
@@ -87,13 +94,14 @@ class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore
 
 # raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
 class APIError(APIError):  # type: ignore
-    def __init__(self, status_code, message, llm_provider, model):
+    def __init__(self, status_code, message, llm_provider, model, request: httpx.Request):
         self.status_code = status_code
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            request=request
         )
 
 # raised if an invalid request (not get, delete, put, post) is made
@@ -123,4 +131,15 @@ class BudgetExceededError(Exception):
         self.current_cost = current_cost
         self.max_budget = max_budget
         message = f"Budget has been exceeded! Current cost: {current_cost}, Max budget: {max_budget}"
-        super().__init__(message)
\ No newline at end of file
+        super().__init__(message)
+
+## DEPRECATED ##
+class InvalidRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            self.message, f"{self.model}"
+        )  # Call the base class constructor with the parameters it needs
diff --git a/litellm/llms/base.py b/litellm/llms/base.py
index cc03be919..a7710a164 100644
--- a/litellm/llms/base.py
+++ b/litellm/llms/base.py
@@ -1,16 +1,21 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
 import litellm
-import requests, certifi, ssl
+import httpx, certifi, ssl
 
 class BaseLLM:
+    _client_session = None
 
     def create_client_session(self):
         if litellm.client_session:
-            session = litellm.client_session
+            _client_session = litellm.client_session
         else:
-            session = requests.Session()
-
-        return session
+            _client_session = httpx.Client(timeout=600)
+        return _client_session
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if hasattr(self, '_client_session'):
+            self._client_session.close()
+
     def validate_environment(self):  # set up the environment required to run the model
         pass
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 3eaaea450..fdc6de8fa 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -1,14 +1,17 @@
 from typing import Optional, Union
-import types, requests
+import types
+import httpx
 from .base import BaseLLM
 from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, convert_to_model_response_object
 from typing import Callable, Optional
 import aiohttp
 
 class OpenAIError(Exception):
-    def __init__(self, status_code, message):
+    def __init__(self, status_code, message, request: httpx.Request, response: httpx.Response):
         self.status_code = status_code
         self.message = message
+        self.request = request
+        self.response = response
         super().__init__(
             self.message
         )  # Call the base class constructor with the parameters it needs
@@ -144,7 +147,7 @@ class OpenAITextCompletionConfig():
         and v is not None}
 
 class OpenAIChatCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client
 
     def __init__(self) -> None:
         super().__init__()
@@ -200,18 +203,8 @@ class OpenAIChatCompletion(BaseLLM):
                     return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
                 else:
                     return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response)
-            elif "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=api_base,
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=api_base,
@@ -219,7 +212,7 @@ class OpenAIChatCompletion(BaseLLM):
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
                 ## RESPONSE OBJECT
                 return convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
@@ -246,41 +239,64 @@ class OpenAIChatCompletion(BaseLLM):
             exception_mapping_worked = True
             raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e
 
     async def acompletion(self, api_base: str, data: dict, headers: dict, model_response: ModelResponse):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                response_json = await response.json()
-                if response.status != 200:
-                    raise OpenAIError(status_code=response.status, message=response.text)
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
-                ## RESPONSE OBJECT
-                return convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+            ## RESPONSE OBJECT
+            return convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+
+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.read().decode(), request=response.request, response=response)
+
+            completion_stream = response.iter_lines()
+            streamwrapper = CustomStreamWrapper(completion_stream=completion_stream, model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
 
     async def async_streaming(self,
                           logging_obj,
                           api_base: str,
-                          data: dict, headers: dict,
+                          data: dict,
+                          headers: dict,
                           model_response: ModelResponse,
                           model: str):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                # Check if the request was successful (status code 200)
-                if response.status != 200:
-                    raise OpenAIError(status_code=response.status, message=await response.text())
-
-                streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai",logging_obj=logging_obj)
-                async for transformed_chunk in streamwrapper:
-                    yield transformed_chunk
+        client = httpx.AsyncClient()
+        async with client.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=(await response.aread()).decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+            async for transformed_chunk in streamwrapper:
+                yield transformed_chunk
 
     def embedding(self,
                 model: str,
@@ -349,7 +365,7 @@ class OpenAIChatCompletion(BaseLLM):
 
 class OpenAITextCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client
 
     def __init__(self) -> None:
         super().__init__()
@@ -367,7 +383,7 @@ class OpenAITextCompletion(BaseLLM):
         try:
             ## RESPONSE OBJECT
             if response_object is None or model_response_object is None:
-                raise OpenAIError(status_code=500, message="Error in response object format")
+                raise ValueError("Error in response object format")
             choice_list=[]
             for idx, choice in enumerate(response_object["choices"]):
                 message = Message(content=choice["text"], role="assistant")
@@ -386,8 +402,8 @@ class OpenAITextCompletion(BaseLLM):
             model_response_object._hidden_params["original_response"] = response_object # track original response, if users make a litellm.text_completion() request, we can return the original response
             return model_response_object
-        except:
-            OpenAIError(status_code=500, message="Invalid response object.")
+        except Exception as e:
+            raise e
 
     def completion(self, 
                model: Optional[str]=None,
@@ -397,6 +413,7 @@ class OpenAITextCompletion(BaseLLM):
                api_key: Optional[str]=None,
                api_base: Optional[str]=None,
                logging_obj=None,
+               acompletion: bool = False,
                optional_params=None,
                litellm_params=None,
                logger_fn=None,
@@ -412,9 +429,6 @@ class OpenAITextCompletion(BaseLLM):
                 api_base = f"{api_base}/completions"
 
             if len(messages)>0 and "content" in messages[0] and type(messages[0]["content"]) == list:
-                # Note: internal logic - for enabling litellm.text_completion()
-                # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
-                # https://platform.openai.com/docs/api-reference/completions/create
                 prompt = messages[0]["content"]
             else:
                 prompt = " ".join([message["content"] for message in messages]) # type: ignore
@@ -431,19 +445,13 @@ class OpenAITextCompletion(BaseLLM):
                     api_key=api_key,
                     additional_args={"headers": headers, "api_base": api_base, "data": data},
                 )
-
-            if "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=f"{api_base}",
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            if acompletion == True:
+                if optional_params.get("stream", False):
+                    return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
+                else:
+                    return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response, prompt=prompt, api_key=api_key, logging_obj=logging_obj, model=model)
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=f"{api_base}",
@@ -451,7 +459,7 @@
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
                 ## LOGGING
                 logging_obj.post_call(
@@ -466,12 +474,76 @@
                 ## RESPONSE OBJECT
                 return self.convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
-        except OpenAIError as e:
-            exception_mapping_worked = True
-            raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e
+
+    async def acompletion(self,
+                          logging_obj,
+                          api_base: str,
+                          data: dict,
+                          headers: dict,
+                          model_response: ModelResponse,
+                          prompt: str,
+                          api_key: str,
+                          model: str):
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key=api_key,
+                original_response=response,
+                additional_args={
+                    "headers": headers,
+                    "api_base": api_base,
+                },
+            )
+
+            ## RESPONSE OBJECT
+            return self.convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+
+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.read().decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
+
+    async def async_streaming(self,
+                          logging_obj,
+                          api_base: str,
+                          data: dict,
+                          headers: dict,
+                          model_response: ModelResponse,
+                          model: str):
+        client = httpx.AsyncClient()
+        async with client.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=(await response.aread()).decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            async for transformed_chunk in streamwrapper:
+                yield transformed_chunk
\ No newline at end of file
diff --git a/litellm/main.py b/litellm/main.py
index
c7c421783..155012a35 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -138,8 +138,10 @@ async def acompletion(*args, **kwargs): _, custom_llm_provider, _, _ = get_llm_provider(model=model, api_base=kwargs.get("api_base", None)) - - if (custom_llm_provider == "openai" or custom_llm_provider == "azure" or custom_llm_provider == "custom_openai"): # currently implemented aiohttp calls for just azure and openai, soon all. + if (custom_llm_provider == "openai" + or custom_llm_provider == "azure" + or custom_llm_provider == "custom_openai" + or custom_llm_provider == "text-completion-openai"): # currently implemented aiohttp calls for just azure and openai, soon all. if kwargs.get("stream", False): response = completion(*args, **kwargs) else: @@ -161,7 +163,7 @@ async def acompletion(*args, **kwargs): line async for line in response ) - else: + else: end_time = datetime.datetime.now() # [OPTIONAL] ADD TO CACHE if litellm.caching or litellm.caching_with_models or litellm.cache != None: # user init a cache object @@ -596,15 +598,16 @@ def completion( print_verbose=print_verbose, api_key=api_key, api_base=api_base, + acompletion=acompletion, logging_obj=logging, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn ) - if "stream" in optional_params and optional_params["stream"] == True: - response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging) - return response + # if "stream" in optional_params and optional_params["stream"] == True: + # response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging) + # return response response = model_response elif ( "replicate" in model or diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py index 5d9d44b20..ba1a375c8 100644 --- a/litellm/tests/test_async_fn.py +++ b/litellm/tests/test_async_fn.py @@ -28,15 +28,13 @@ def test_async_response(): user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="gpt-3.5-turbo", messages=messages) - print(f"response: {response}") - response = await acompletion(model="azure/chatgpt-v-2", messages=messages) + response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages) print(f"response: {response}") except Exception as e: pytest.fail(f"An exception occurred: {e}") response = asyncio.run(test_get_response()) -# print(response) + print(response) # test_async_response() def test_get_response_streaming(): @@ -45,8 +43,7 @@ def test_get_response_streaming(): user_message = "write a short poem in one sentence" messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="azure/chatgpt-v-2", messages=messages, stream=True) - # response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True) + response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages, stream=True) print(type(response)) import inspect @@ -59,18 +56,17 @@ def test_get_response_streaming(): async for chunk in response: token = chunk["choices"][0]["delta"].get("content", "") output += token - print(f"output: {output}") assert output is not None, "output cannot be None." assert isinstance(output, str), "output needs to be of type str" - assert len(output) > 0, f"Length of output needs to be greater than 0. {output}" - + assert len(output) > 0, "Length of output needs to be greater than 0." 
+ print(f'output: {output}') except Exception as e: pytest.fail(f"An exception occurred: {e}") return response asyncio.run(test_async_call()) -test_get_response_streaming() +# test_get_response_streaming() def test_get_response_non_openai_streaming(): import asyncio diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 2e1c35261..ad48a09cc 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest -from openai.error import Timeout +from openai import Timeout import litellm from litellm import embedding, completion, completion_cost from litellm import RateLimitError @@ -405,7 +405,6 @@ def test_completion_openai(): litellm.api_key = os.environ['OPENAI_API_KEY'] response = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=10, request_timeout=10) print("This is the response object\n", response) - print("\n\nThis is response ms:", response.response_ms) response_str = response["choices"][0]["message"]["content"] @@ -422,14 +421,15 @@ def test_completion_openai(): pass except Exception as e: pytest.fail(f"Error occurred: {e}") -# test_completion_openai() +test_completion_openai() def test_completion_text_openai(): try: - litellm.set_verbose = True + # litellm.set_verbose = True response = completion(model="gpt-3.5-turbo-instruct", messages=messages) - print(response) + print(response["choices"][0]["message"]["content"]) except Exception as e: + print(e) pytest.fail(f"Error occurred: {e}") # test_completion_text_openai() diff --git a/litellm/tests/test_completion_with_retries.py b/litellm/tests/test_completion_with_retries.py index 30ec9ec8c..d5d837e95 100644 --- a/litellm/tests/test_completion_with_retries.py +++ b/litellm/tests/test_completion_with_retries.py @@ -14,7 +14,7 @@ import litellm from litellm import completion_with_retries, completion from litellm import ( AuthenticationError, - InvalidRequestError, + BadRequestError, RateLimitError, ServiceUnavailableError, OpenAIError, diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index bc375c4bf..5a5c22898 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -1,4 +1,8 @@ -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError +try: + from openai import AuthenticationError, BadRequestError, RateLimitError, OpenAIError +except: + from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError + import os import sys import traceback @@ -38,23 +42,24 @@ models = ["command-nightly"] # Test 1: Context Window Errors @pytest.mark.parametrize("model", models) def test_context_window(model): - sample_text = "Say error 50 times" * 100000 + sample_text = "Say error 50 times" * 10000 messages = [{"content": sample_text, "role": "user"}] - print(f"model: {model}") try: - completion(model=model, messages=messages) + response = completion(model=model, messages=messages) + print(f"response: {response}") + print("FAILED!") pytest.fail(f"An exception occurred") - except ContextWindowExceededError: - pass + except ContextWindowExceededError as e: + print(f"Worked!") except RateLimitError: - pass + print("RateLimited!") except Exception as e: print(f"{e}") pytest.fail(f"An error occcurred - {e}") @pytest.mark.parametrize("model", models) def test_context_window_with_fallbacks(model): - ctx_window_fallback_dict = {"command-nightly": 
"claude-2"} + ctx_window_fallback_dict = {"command-nightly": "claude-2", "gpt-3.5-turbo-instruct": "gpt-3.5-turbo-16k"} sample_text = "how does a court case get to the Supreme Court?" * 1000 messages = [{"content": sample_text, "role": "user"}] @@ -62,8 +67,8 @@ def test_context_window_with_fallbacks(model): # for model in litellm.models_by_provider["bedrock"]: # test_context_window(model=model) -# test_context_window(model="gpt-3.5-turbo-instruct") -# test_context_window_with_fallbacks(model="command-nightly") +# test_context_window(model="gpt-3.5-turbo") +# test_context_window_with_fallbacks(model="gpt-3.5-turbo") # Test 2: InvalidAuth Errors @pytest.mark.parametrize("model", models) def invalid_auth(model): # set the model key to an invalid key, depending on the model @@ -158,14 +163,14 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th # for model in litellm.models_by_provider["bedrock"]: # invalid_auth(model=model) -# invalid_auth(model="gpt-3.5-turbo-instruct") +# invalid_auth(model="gpt-3.5-turbo") # Test 3: Invalid Request Error @pytest.mark.parametrize("model", models) def test_invalid_request_error(model): messages = [{"content": "hey, how's it going?", "role": "user"}] - with pytest.raises(InvalidRequestError): + with pytest.raises(BadRequestError): completion(model=model, messages=messages, max_tokens="hello world") # test_invalid_request_error(model="gpt-3.5-turbo") @@ -178,15 +183,16 @@ def test_invalid_request_error(model): # response = completion(model=model, messages=messages) # except RateLimitError: # return True -# except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server -# return True +# # except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server +# # return True # except Exception as e: # print(f"Uncaught Exception {model}: {type(e).__name__} - {e}") # traceback.print_exc() # pass # return False # # Repeat each model 500 times -# extended_models = [model for model in models for _ in range(250)] +# # extended_models = [model for model in models for _ in range(250)] +# extended_models = ["gpt-3.5-turbo-instruct" for _ in range(250)] # def worker(model): # return test_model_call(model) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index c9b3b9a67..316309e04 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -11,7 +11,7 @@ sys.path.insert( from dotenv import load_dotenv load_dotenv() import litellm -from litellm import completion, acompletion, AuthenticationError, InvalidRequestError, RateLimitError +from litellm import completion, acompletion, AuthenticationError, BadRequestError, RateLimitError, ModelResponse litellm.logging = False litellm.set_verbose = False @@ -47,38 +47,17 @@ first_openai_chunk_example = { def validate_first_format(chunk): # write a test to make sure chunk follows the same format as first_openai_chunk_example - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." 
- - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "role" in choice['delta'], "'delta' should have a 'role'." assert isinstance(choice['delta']['role'], str), "'role' should be a string." - - assert "content" in choice['delta'], "'delta' should have 'content'." assert isinstance(choice['delta']['content'], str), "'content' should be a string." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string." second_openai_chunk_example = { @@ -98,35 +77,16 @@ second_openai_chunk_example = { } def validate_second_format(chunk): - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." - - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "content" in choice['delta'], "'delta' should have 'content'." assert isinstance(choice['delta']['content'], str), "'content' should be a string." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string." last_openai_chunk_example = { @@ -144,32 +104,15 @@ last_openai_chunk_example = { } def validate_last_format(chunk): - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." 
- - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string." def streaming_format_tests(idx, chunk): @@ -188,6 +131,7 @@ def streaming_format_tests(idx, chunk): if chunk["choices"][0]["finish_reason"]: # ensure finish reason is only in last chunk validate_last_format(chunk=chunk) finished = True + print(f"chunk choices: {chunk['choices'][0]['delta']['content']}") if "content" in chunk["choices"][0]["delta"]: extracted_chunk = chunk["choices"][0]["delta"]["content"] print(f"extracted chunk: {extracted_chunk}") @@ -549,7 +493,7 @@ def test_completion_claude_stream_bad_key(): pytest.fail(f"Error occurred: {e}") -test_completion_claude_stream_bad_key() +# test_completion_claude_stream_bad_key() # test_completion_replicate_stream() # def test_completion_vertexai_stream(): @@ -824,7 +768,7 @@ def ai21_completion_call_bad_key(): if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") - except InvalidRequestError as e: + except Bad as e: pass except: pytest.fail(f"error occurred: {traceback.format_exc()}") @@ -885,7 +829,7 @@ def ai21_completion_call_bad_key(): # test on openai completion call def test_openai_chat_completion_call(): try: - litellm.set_verbose = True + litellm.set_verbose = False response = completion( model="gpt-3.5-turbo", messages=messages, stream=True ) @@ -904,7 +848,7 @@ def test_openai_chat_completion_call(): print(f"error occurred: {traceback.format_exc()}") pass -# test_openai_chat_completion_call() +test_openai_chat_completion_call() def test_openai_chat_completion_complete_response_call(): try: @@ -928,6 +872,7 @@ def test_openai_text_completion_call(): start_time = time.time() for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + print(f"chunk: {chunk}") complete_response += chunk if finished: break @@ -939,6 +884,8 @@ def test_openai_text_completion_call(): print(f"error occurred: {traceback.format_exc()}") pass +# test_openai_text_completion_call() + # # test on together ai completion call - starcoder def test_together_ai_completion_call_starcoder(): try: @@ -992,7 +939,7 @@ def test_together_ai_completion_call_starcoder_bad_key(): if complete_response == "": raise Exception("Empty response received") print(f"complete response: {complete_response}") - except InvalidRequestError as e: + except BadRequestError as e: pass except: print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/timeout.py b/litellm/timeout.py index a55f31dac..65af51b98 100644 --- a/litellm/timeout.py +++ b/litellm/timeout.py @@ -17,7 +17,10 @@ from concurrent import futures from inspect import iscoroutinefunction from functools import wraps from threading import Thread -from openai.error import Timeout +try: + from openai 
import Timeout +except: + from openai.error import Timeout def timeout(timeout_duration: float = 0.0, exception_to_raise=Timeout): diff --git a/litellm/utils.py b/litellm/utils.py index 281ad713e..c7a7fb292 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -39,11 +39,15 @@ from .integrations.weights_biases import WeightsBiasesLogger from .integrations.custom_logger import CustomLogger from .integrations.langfuse import LangFuseLogger from .integrations.litedebugger import LiteDebugger -from openai.error import OpenAIError as OriginalError -from openai.openai_object import OpenAIObject +try: + from openai import OpenAIError as OriginalError + from openai._models import BaseModel as OpenAIObject +except: + from openai.error import OpenAIError as OriginalError + from openai.openai_object import OpenAIObject from .exceptions import ( AuthenticationError, - InvalidRequestError, + BadRequestError, RateLimitError, ServiceUnavailableError, OpenAIError, @@ -53,7 +57,7 @@ from .exceptions import ( APIError, BudgetExceededError ) -from typing import cast, List, Dict, Union, Optional +from typing import cast, List, Dict, Union, Optional, Literal from .caching import Cache ####### ENVIRONMENT VARIABLES #################### @@ -127,13 +131,41 @@ class Message(OpenAIObject): if function_call: self.function_call = function_call + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + class Delta(OpenAIObject): - def __init__(self, content=None, logprobs=None, role=None, **params): + def __init__(self, content=None, role=None, **params): super(Delta, self).__init__(**params) if content is not None: self.content = content if role: self.role = role + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class Choices(OpenAIObject): @@ -148,6 +180,22 @@ class Choices(OpenAIObject): self.message = Message(content=None) else: self.message = message + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class Usage(OpenAIObject): def __init__(self, prompt_tokens=None, completion_tokens=None, total_tokens=None, **params): @@ -158,6 +206,22 @@ class Usage(OpenAIObject): self.completion_tokens = completion_tokens if total_tokens: self.total_tokens = total_tokens + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def 
get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class StreamingChoices(OpenAIObject): def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params): @@ -168,47 +232,97 @@ class StreamingChoices(OpenAIObject): self.delta = delta else: self.delta = Delta() + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) -class ModelResponse(OpenAIObject): - def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + +class ModelResponse(OpenAIObject): + id: str + """A unique identifier for the completion.""" + + choices: List[Union[Choices, StreamingChoices]] + """The list of completion choices the model generated for the input prompt.""" + + created: int + """The Unix timestamp (in seconds) of when the completion was created.""" + + model: Optional[str] = None + """The model used for completion.""" + + object: str + """The object type, which is always "text_completion" """ + + system_fingerprint: Optional[str] = None + """This fingerprint represents the backend configuration that the model runs with. + + Can be used in conjunction with the `seed` request parameter to understand when + backend changes have been made that might impact determinism. 
+ """ + + usage: Optional[Usage] = None + """Usage statistics for the completion request.""" + + _hidden_params: dict = {} + + def __init__(self, id=None, choices=None, created=None, model=None, object=None, system_fingerprint=None, usage=None, stream=False, response_ms=None, hidden_params=None, **params): if stream: - self.object = "chat.completion.chunk" - self.choices = [StreamingChoices()] + object = "chat.completion.chunk" + choices = [StreamingChoices()] else: if model in litellm.open_ai_embedding_models: - self.object = "embedding" + object = "embedding" else: - self.object = "chat.completion" - self.choices = [Choices()] + object = "chat.completion" + choices = [Choices()] if id is None: - self.id = _generate_id() + id = _generate_id() else: - self.id = id + id = id if created is None: - self.created = int(time.time()) + created = int(time.time()) else: - self.created = created + created = created if response_ms: - self._response_ms = response_ms + _response_ms = response_ms else: - self._response_ms = None - self.model = model + _response_ms = None + model = model if usage: - self.usage = usage + usage = usage else: - self.usage = Usage() - self._hidden_params = {} # used in case users want to access the original model response - super(ModelResponse, self).__init__(**params) - - def to_dict_recursive(self): - d = super().to_dict_recursive() - d["choices"] = [choice.to_dict_recursive() for choice in self.choices] - return d - - def cost(self): - # for non streaming responses - return completion_cost(completion_response=self) + usage = Usage() + if hidden_params: + self._hidden_params = hidden_params + super().__init__(id=id, choices=choices, created=created, model=model, object=object, system_fingerprint=system_fingerprint, usage=usage, **params) + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class EmbeddingResponse(OpenAIObject): def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): self.object = "list" @@ -2771,7 +2885,7 @@ def valid_model(model): messages = [{"role": "user", "content": "Hello World"}] litellm.completion(model=model, messages=messages) except: - raise InvalidRequestError(message="", model=model, llm_provider="") + raise BadRequestError(message="", model=model, llm_provider="") def check_valid_key(model: str, api_key: str): """ @@ -2955,7 +3069,7 @@ def exception_type( exception_mapping_worked = True if custom_llm_provider == "openrouter": if original_exception.http_status == 413: - raise InvalidRequestError( + raise BadRequestError( message=str(original_exception), model=model, llm_provider="openrouter" @@ -2975,19 +3089,21 @@ def exception_type( else: exception_type = "" if custom_llm_provider == "openai" or custom_llm_provider == "text-completion-openai": - if "This model's maximum context length is" in error_str: + if "This model's maximum context length is" in error_str or "Request too large" in error_str: exception_mapping_worked = True raise ContextWindowExceededError( message=f"OpenAIException - {original_exception.message}", 
llm_provider="openai", - model=model + model=model, + response=original_exception.response ) - elif "invalid_request_error" in error_str: + elif "invalid_request_error" in error_str and "Incorrect API key provided" not in error_str: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + response=original_exception.response ) elif hasattr(original_exception, "status_code"): exception_mapping_worked = True @@ -2996,21 +3112,24 @@ def exception_type( raise AuthenticationError( message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( message=f"OpenAIException - {original_exception.message}", model=model, - llm_provider="openai" + llm_provider="openai", + request=original_exception.request ) if original_exception.status_code == 422: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"OpenAIException - {original_exception.message}", model=model, llm_provider="openai", + response=original_exception.response ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -3018,6 +3137,7 @@ def exception_type( message=f"OpenAIException - {original_exception.message}", model=model, llm_provider="openai", + response=original_exception.response ) else: exception_mapping_worked = True @@ -3025,7 +3145,8 @@ def exception_type( status_code=original_exception.status_code, message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "anthropic": # one of the anthropics if hasattr(original_exception, "message"): @@ -3034,14 +3155,16 @@ def exception_type( raise ContextWindowExceededError( message=original_exception.message, model=model, - llm_provider="anthropic" + llm_provider="anthropic", + response=original_exception.response ) if "Invalid API Key" in original_exception.message: exception_mapping_worked = True raise AuthenticationError( message=original_exception.message, model=model, - llm_provider="anthropic" + llm_provider="anthropic", + response=original_exception.response ) if hasattr(original_exception, "status_code"): print_verbose(f"status_code: {original_exception.status_code}") @@ -3050,35 +3173,32 @@ def exception_type( raise AuthenticationError( message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + response=original_exception.response ) - elif original_exception.status_code == 400: + elif original_exception.status_code == 400 or original_exception.status_code == 413: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic", + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( - message=f"AnthropicException - {original_exception.message}", - model=model, - llm_provider="anthropic" - ) - elif original_exception.status_code == 413: - exception_mapping_worked = True - raise InvalidRequestError( message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic", + request=original_exception.request 
) elif original_exception.status_code == 429: exception_mapping_worked = True raise RateLimitError( message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -3093,7 +3213,8 @@ def exception_type( status_code=original_exception.status_code, message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "replicate": if "Incorrect authentication token" in error_str: @@ -3101,7 +3222,8 @@ def exception_type( raise AuthenticationError( message=f"ReplicateException - {error_str}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif "input is too long" in error_str: exception_mapping_worked = True @@ -3109,20 +3231,23 @@ def exception_type( message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", + response=original_exception.response ) elif exception_type == "ModelError": exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", + response=original_exception.response ) elif "Request was throttled" in error_str: exception_mapping_worked = True raise RateLimitError( message=f"ReplicateException - {error_str}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif hasattr(original_exception, "status_code"): if original_exception.status_code == 401: @@ -3130,35 +3255,32 @@ def exception_type( raise AuthenticationError( message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) - elif original_exception.status_code == 400 or original_exception.status_code == 422: + elif original_exception.status_code == 400 or original_exception.status_code == 422 or original_exception.status_code == 413: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate" - ) - elif original_exception.status_code == 413: - exception_mapping_worked = True - raise InvalidRequestError( message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", + request=original_exception.request ) elif original_exception.status_code == 429: exception_mapping_worked = True raise RateLimitError( message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -3172,7 +3294,8 @@ def exception_type( status_code=500, message=f"ReplicateException - {str(original_exception)}", llm_provider="replicate", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "bedrock": if "too many tokens" in error_str or "expected maxLength:" in error_str or "Input is too long" in error_str or "Too many input tokens" in error_str: @@ 
-3180,28 +3303,32 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"BedrockException: Context Window Error - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "Malformed input request" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"BedrockException - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "Unable to locate credentials" in error_str or "The security token included in the request is invalid" in error_str:
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"BedrockException Invalid Authentication - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "throttlingException" in error_str or "ThrottlingException" in error_str:
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"BedrockException: Rate Limit Error - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 500:
@@ -3216,33 +3343,37 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"BedrockException - {original_exception.message}",
                        llm_provider="bedrock",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
        elif custom_llm_provider == "sagemaker":
            if "Unable to locate credentials" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"SagemakerException - {error_str}",
                    model=model,
-                   llm_provider="sagemaker"
+                   llm_provider="sagemaker",
+                   response=original_exception.response
                )
        elif custom_llm_provider == "vertex_ai":
            if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"VertexAIException - {error_str}",
                    model=model,
-                   llm_provider="vertex_ai"
+                   llm_provider="vertex_ai",
+                   response=original_exception.response
                )
        elif custom_llm_provider == "palm":
            if "503 Getting metadata" in error_str: # auth errors look like this
                # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate.
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"PalmException - Invalid api key",
                    model=model,
-                   llm_provider="palm"
+                   llm_provider="palm",
+                   response=original_exception.response
                )
            if "400 Request payload size exceeds" in error_str:
                exception_mapping_worked = True
@@ -3250,6 +3381,7 @@ def exception_type(
                    message=f"PalmException - {error_str}",
                    model=model,
                    llm_provider="palm",
+                   response=original_exception.response
                ) # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes
        elif custom_llm_provider == "cohere": # Cohere
@@ -3261,7 +3393,8 @@ def exception_type(
                raise AuthenticationError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "too many tokens" in error_str:
                exception_mapping_worked = True
@@ -3269,14 +3402,16 @@ def exception_type(
                    message=f"CohereException - {original_exception.message}",
                    model=model,
                    llm_provider="cohere",
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                if original_exception.status_code == 400 or original_exception.status_code == 498:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"CohereException - {original_exception.message}",
                        llm_provider="cohere",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500:
                    exception_mapping_worked = True
@@ -3292,14 +3427,16 @@ def exception_type(
                    raise RateLimitError(
                        message=f"CohereException - {original_exception.message}",
                        llm_provider="cohere",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
            elif "invalid type:" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "Unexpected server error" in error_str:
                exception_mapping_worked = True
@@ -3315,7 +3452,8 @@ def exception_type(
                    status_code=original_exception.status_code,
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   request=original_exception.request
                )
            raise original_exception
        elif custom_llm_provider == "huggingface":
@@ -3324,14 +3462,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=error_str,
                    model=model,
-                   llm_provider="huggingface"
+                   llm_provider="huggingface",
+                   response=original_exception.response
                )
            elif "A valid user token is required" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=error_str,
                    llm_provider="huggingface",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
@@ -3339,28 +3479,32 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        model=model,
                        llm_provider="huggingface",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 408:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"HuggingfaceException - {original_exception.message}",
                        model=model,
-                       llm_provider="huggingface"
+                       llm_provider="huggingface",
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3368,7 +3512,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
            exception_mapping_worked = True
            raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
@@ -3379,14 +3524,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AI21Exception - {original_exception.message}",
                    model=model,
-                   llm_provider="ai21"
+                   llm_provider="ai21",
+                   response=original_exception.response
                )
            if "Bad or missing API token." in original_exception.message:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AI21Exception - {original_exception.message}",
                    model=model,
-                   llm_provider="ai21"
+                   llm_provider="ai21",
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
@@ -3394,27 +3541,32 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 408:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"AI21Exception - {original_exception.message}",
                        model=model,
-                       llm_provider="ai21"
+                       llm_provider="ai21",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AI21Exception - {original_exception.message}",
                        model=model,
                        llm_provider="ai21",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
+                       model=model,
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3422,7 +3574,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "nlp_cloud":
            if "detail" in error_str:
@@ -3431,14 +3584,16 @@ def exception_type(
                    raise ContextWindowExceededError(
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       response=original_exception.response
                    )
                elif "value is not a valid" in error_str:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3446,35 +3601,41 @@ def exception_type(
                        status_code=500,
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       request=original_exception.request
                    )
            if hasattr(original_exception, "status_code"): # https://docs.nlpcloud.com/?shell#errors
                if original_exception.status_code == 400 or original_exception.status_code == 406 or original_exception.status_code == 413 or original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 401 or original_exception.status_code == 403:
                    exception_mapping_worked = True
                    raise AuthenticationError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 522 or original_exception.status_code == 524:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"NLPCloudException - {original_exception.message}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 429 or original_exception.status_code == 402:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500 or original_exception.status_code == 503:
                    exception_mapping_worked = True
@@ -3482,7 +3643,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 504 or original_exception.status_code == 520:
                    exception_mapping_worked = True
@@ -3497,7 +3659,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "together_ai":
            import json
@@ -3507,49 +3670,56 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error" in error_response and "invalid private key" in error_response["error"]:
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"TogetherAIException - {error_response['error']}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error_type" in error_response and error_response["error_type"] == "validation":
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif original_exception.status_code == 408:
                exception_mapping_worked = True
                raise Timeout(
                    message=f"TogetherAIException - {original_exception.message}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   request=original_exception.request
                )
            elif original_exception.status_code == 429:
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"TogetherAIException - {original_exception.message}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            else:
                exception_mapping_worked = True
@@ -3557,7 +3727,8 @@ def exception_type(
                    status_code=original_exception.status_code,
                    message=f"TogetherAIException - {original_exception.message}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   request=original_exception.request
                )
        elif custom_llm_provider == "aleph_alpha":
            if "This is longer than the model's maximum context length" in error_str:
@@ -3565,14 +3736,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AlephAlphaException - {original_exception.message}",
                    llm_provider="aleph_alpha",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "InvalidToken" in error_str or "No token provided" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AlephAlphaException - {original_exception.message}",
                    llm_provider="aleph_alpha",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                print_verbose(f"status code: {original_exception.status_code}")
@@ -3585,17 +3758,19 @@ def exception_type(
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AlephAlphaException - {original_exception.message}",
                        llm_provider="aleph_alpha",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"AlephAlphaException - {original_exception.message}",
                        llm_provider="aleph_alpha",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500:
                    exception_mapping_worked = True
@@ -3616,10 +3791,11 @@ def exception_type(
            error_str = str(original_exception)
            if "no such file or directory" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}",
                    model=model,
-                   llm_provider="ollama"
+                   llm_provider="ollama",
+                   response=original_exception.response
                )
            elif "Failed to establish a new connection" in error_str:
                exception_mapping_worked = True
@@ -3630,10 +3806,11 @@ def exception_type(
                )
            elif "Invalid response object from API" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"OllamaException: {original_exception}",
                    llm_provider="ollama",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
        elif custom_llm_provider == "vllm":
            if hasattr(original_exception, "status_code"):
@@ -3650,14 +3827,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AzureException - {original_exception.message}",
                    llm_provider="azure",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "invalid_request_error" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AzureException - {original_exception.message}",
                    llm_provider="azure",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                exception_mapping_worked = True
@@ -3673,14 +3852,16 @@ def exception_type(
                    raise Timeout(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
-                       llm_provider="azure"
+                       llm_provider="azure",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
                        llm_provider="azure",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
@@ -3688,6 +3869,7 @@ def exception_type(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
                        llm_provider="azure",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3695,7 +3877,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"AzureException - {original_exception.message}",
                        llm_provider="azure",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "custom_openai" or custom_llm_provider == "maritalk":
            if hasattr(original_exception, "status_code"):
@@ -3712,14 +3895,16 @@ def exception_type(
                    raise Timeout(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
-                       llm_provider="custom_openai"
+                       llm_provider="custom_openai",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
                        llm_provider="custom_openai",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
@@ -3727,6 +3912,7 @@ def exception_type(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
                        llm_provider="custom_openai",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3734,17 +3920,20 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"CustomOpenAIException - {original_exception.message}",
                        llm_provider="custom_openai",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        exception_mapping_worked = True
-       if "InvalidRequestError.__init__() missing 1 required positional argument: 'param'" in str(original_exception): # deal with edge-case invalid request error bug in openai-python sdk
-           raise InvalidRequestError(
+       if "BadRequestError.__init__() missing 1 required positional argument: 'param'" in str(original_exception): # deal with edge-case invalid request error bug in openai-python sdk
+           raise BadRequestError(
                message=f"OpenAIException: This can happen due to missing AZURE_API_VERSION: {str(original_exception)}",
                model=model,
-               llm_provider=custom_llm_provider
+               llm_provider=custom_llm_provider,
+               response=original_exception.response
            )
        else:
-           raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
+           # raise BadRequestError(message=str(original_exception), llm_provider=custom_llm_provider, model=model, response=original_exception.response)
+           raise original_exception
    except Exception as e:
        # LOGGING
        exception_logging(
@@ -4079,7 +4268,7 @@ class CustomStreamWrapper:
    def handle_openai_chat_completion_chunk(self, chunk):
        try:
-           str_line = chunk.decode("utf-8") # Convert bytes to string
+           str_line = chunk
            text = ""
            is_finished = False
            finish_reason = None
@@ -4110,11 +4299,17 @@ class CustomStreamWrapper:
    def handle_openai_text_completion_chunk(self, chunk):
        try:
-           str_line = chunk.decode("utf-8") # Convert bytes to string
+           # str_line = chunk.decode("utf-8") # Convert bytes to string
+           str_line = chunk
            text = ""
            is_finished = False
            finish_reason = None
-           if str_line.startswith("data:"):
+           if "data: [DONE]" in str_line:
+               text = ""
+               is_finished = True
+               finish_reason = "stop"
+               return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+           elif str_line.startswith("data:"):
                data_json = json.loads(str_line[5:])
                print_verbose(f"delta content: {data_json['choices'][0]['text']}")
                text = data_json["choices"][0].get("text", "")
@@ -4194,7 +4389,6 @@ class CustomStreamWrapper:
    def chunk_creator(self, chunk):
        model_response = ModelResponse(stream=True, model=self.model)
        try:
-           # return this for all models
            completion_obj = {"content": ""}
            if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
@@ -4364,18 +4558,24 @@ class CustomStreamWrapper:
    ## needs to handle the empty string case (even starting chunk can be an empty string)
    def __next__(self):
        while True: # loop until a non-empty string is found
-           if isinstance(self.completion_stream, str):
-               chunk = self.completion_stream
-           else:
+           try:
+               # if isinstance(self.completion_stream, str):
+               #     chunk = self.completion_stream
+               # else:
                chunk = next(self.completion_stream)
-           response = self.chunk_creator(chunk=chunk)
-           if response is not None:
+               response = self.chunk_creator(chunk=chunk)
+               # if response is not None:
                return response
+           except Exception as e:
+               raise StopIteration
    async def __anext__(self):
        try:
-           if self.custom_llm_provider == "openai" or self.custom_llm_provider == "azure":
-               async for chunk in self.completion_stream.content:
+           if (self.custom_llm_provider == "openai"
+               or self.custom_llm_provider == "azure"
+               or self.custom_llm_provider == "custom_openai"
+               or self.custom_llm_provider == "text-completion-openai"):
+               async for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception
                processed_chunk = self.chunk_creator(chunk=chunk)
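Note: the snippet below is an illustrative sketch, not part of the patch. It shows how the remapped exception classes defined in litellm/exceptions.py above can be constructed and caught once they carry an httpx.Response, assuming openai>=1.0.0 and httpx are installed; the model name and URL are placeholders.

    import httpx
    from litellm.exceptions import BadRequestError, ContextWindowExceededError

    # Stand-in for `original_exception.response` (placeholder URL, not a real endpoint).
    request = httpx.Request("POST", "https://example.invalid/v1/completions")
    response = httpx.Response(status_code=400, request=request)

    try:
        raise ContextWindowExceededError(
            message="BedrockException: Context Window Error - prompt too long",
            model="placeholder-model",  # hypothetical model name
            llm_provider="bedrock",
            response=response,
        )
    except BadRequestError as e:
        # ContextWindowExceededError now subclasses BadRequestError, so one handler
        # covers both; the mapped status code, provider, and model are preserved.
        print(e.status_code, e.llm_provider, e.model)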
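Similarly, the streaming handlers above now receive already-decoded SSE strings rather than raw bytes. The sketch below is a standalone approximation (not the actual CustomStreamWrapper method) of what handle_openai_text_completion_chunk is expected to do with the new "data: [DONE]" sentinel; the helper name is hypothetical and the tail of the parsing logic is inferred, since the diff truncates it.

    import json

    def parse_text_completion_chunk(str_line: str) -> dict:
        # Assumes str_line is a decoded SSE line such as 'data: {...}' or 'data: [DONE]'.
        if "data: [DONE]" in str_line:
            return {"text": "", "is_finished": True, "finish_reason": "stop"}
        text, is_finished, finish_reason = "", False, None
        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
            text = data_json["choices"][0].get("text", "")
            if data_json["choices"][0].get("finish_reason"):
                is_finished = True
                finish_reason = data_json["choices"][0]["finish_reason"]
        return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

    print(parse_text_completion_chunk('data: {"choices": [{"text": "Hello", "finish_reason": null}]}'))
    print(parse_text_completion_chunk("data: [DONE]"))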