diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py
index 67bbd33f4..32d5c828d 100644
--- a/litellm/llms/azure.py
+++ b/litellm/llms/azure.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional, Union, Any
 import types, requests
 from .base import BaseLLM
 from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, convert_to_model_response_object
@@ -98,6 +98,7 @@ class AzureChatCompletion(BaseLLM):
                api_type: str,
                azure_ad_token: str,
                print_verbose: Callable,
+               timeout,
                logging_obj,
                optional_params,
                litellm_params,
@@ -129,13 +130,13 @@ class AzureChatCompletion(BaseLLM):
                )
                if acompletion is True:
                    if optional_params.get("stream", False):
-                       return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, model=model, api_key=api_key, api_version=api_version, azure_ad_token=azure_ad_token)
+                       return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, model=model, api_key=api_key, api_version=api_version, azure_ad_token=azure_ad_token, timeout=timeout)
                    else:
-                       return self.acompletion(api_base=api_base, data=data, model_response=model_response, api_key=api_key, api_version=api_version, model=model, azure_ad_token=azure_ad_token)
+                       return self.acompletion(api_base=api_base, data=data, model_response=model_response, api_key=api_key, api_version=api_version, model=model, azure_ad_token=azure_ad_token, timeout=timeout)
                elif "stream" in optional_params and optional_params["stream"] == True:
-                   return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, model=model, api_key=api_key, api_version=api_version, azure_ad_token=azure_ad_token)
+                   return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, model=model, api_key=api_key, api_version=api_version, azure_ad_token=azure_ad_token, timeout=timeout)
                else:
-                   azure_client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.client_session)
+                   azure_client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.client_session, timeout=timeout)
                    response = azure_client.chat.completions.create(**data) # type: ignore
                    return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response)
        except AzureOpenAIError as e:
@@ -150,16 +151,18 @@ class AzureChatCompletion(BaseLLM):
                    model: str,
                    api_base: str,
                    data: dict,
+                   timeout: Any,
                    model_response: ModelResponse,
                    azure_ad_token: Optional[str]=None, ):
+       response = None
        try:
-           azure_client = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.aclient_session)
+           azure_client = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.aclient_session, timeout=timeout)
            response = await azure_client.chat.completions.create(**data)
            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response)
        except Exception as e:
            if isinstance(e,httpx.TimeoutException):
                raise AzureOpenAIError(status_code=500, message="Request Timeout Error")
-           elif response and hasattr(response, "text"):
+           elif response is not None and hasattr(response, "text"):
                raise AzureOpenAIError(status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}")
            else:
                raise AzureOpenAIError(status_code=500, message=f"{str(e)}")
@@ -171,9 +174,10 @@ class AzureChatCompletion(BaseLLM):
                  api_version: str,
                  data: dict,
                  model: str,
+                 timeout: Any,
                  azure_ad_token: Optional[str]=None,
    ):
-       azure_client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.client_session)
+       azure_client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.client_session, timeout=timeout)
        response = azure_client.chat.completions.create(**data)
        streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="azure",logging_obj=logging_obj)
        for transformed_chunk in streamwrapper:
@@ -186,8 +190,9 @@ class AzureChatCompletion(BaseLLM):
                          api_version: str,
                          data: dict,
                          model: str,
+                         timeout: Any,
                          azure_ad_token: Optional[str]=None):
-       azure_client = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.aclient_session)
+       azure_client = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_base, azure_deployment=model, azure_ad_token=azure_ad_token, http_client=litellm.aclient_session, timeout=timeout)
        response = await azure_client.chat.completions.create(**data)
        streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="azure",logging_obj=logging_obj)
        async for transformed_chunk in streamwrapper:
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index f69f5a94b..27f0c3d92 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional, Union, Any
 import types, time, json
 import httpx
 from .base import BaseLLM
@@ -161,6 +161,7 @@ class OpenAIChatCompletion(BaseLLM):

    def completion(self,
               model_response: ModelResponse,
+              timeout: Any,
               model: Optional[str]=None,
               messages: Optional[list]=None,
               print_verbose: Optional[Callable]=None,
@@ -180,6 +181,9 @@ class OpenAIChatCompletion(BaseLLM):
            if model is None or messages is None:
                raise OpenAIError(status_code=422, message=f"Missing model or messages")
+           if not isinstance(timeout, float):
+               raise OpenAIError(status_code=422, message=f"Timeout needs to be a float")
+
            for _ in range(2): # if call fails due to alternating messages, retry with reformatted message
                data = {
                    "model": model,
@@ -197,13 +201,13 @@ class OpenAIChatCompletion(BaseLLM):
                try:
                    if acompletion is True:
                        if optional_params.get("stream", False):
-                           return self.async_streaming(logging_obj=logging_obj, data=data, model=model, api_base=api_base, api_key=api_key)
+                           return self.async_streaming(logging_obj=logging_obj, data=data, model=model, api_base=api_base, api_key=api_key, timeout=timeout)
                        else:
-                           return self.acompletion(data=data, model_response=model_response, api_base=api_base, api_key=api_key)
+                           return self.acompletion(data=data, model_response=model_response, api_base=api_base, api_key=api_key, timeout=timeout)
                    elif optional_params.get("stream", False):
-                       return self.streaming(logging_obj=logging_obj, data=data, model=model, api_base=api_base, api_key=api_key)
+                       return self.streaming(logging_obj=logging_obj, data=data, model=model, api_base=api_base, api_key=api_key, timeout=timeout)
                    else:
-                       openai_client = OpenAI(api_key=api_key, base_url=api_base, http_client=litellm.client_session)
+                       openai_client = OpenAI(api_key=api_key, base_url=api_base, http_client=litellm.client_session, timeout=timeout)
                        response = openai_client.chat.completions.create(**data) # type: ignore
                        return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response)
                except Exception as e:
@@ -234,27 +238,32 @@ class OpenAIChatCompletion(BaseLLM):
    async def acompletion(self,
                          data: dict,
                          model_response: ModelResponse,
+                         timeout: float,
                          api_key: Optional[str]=None,
                          api_base: Optional[str]=None):
        response = None
        try:
-           openai_aclient = AsyncOpenAI(api_key=api_key, base_url=api_base, http_client=litellm.aclient_session)
+           openai_aclient = AsyncOpenAI(api_key=api_key, base_url=api_base, http_client=litellm.aclient_session, timeout=timeout)
            response = await openai_aclient.chat.completions.create(**data)
            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response)
        except Exception as e:
            if response and hasattr(response, "text"):
                raise OpenAIError(status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}")
            else:
-               raise OpenAIError(status_code=500, message=f"{str(e)}")
+               if type(e).__name__ == "ReadTimeout":
+                   raise OpenAIError(status_code=408, message=f"{type(e).__name__}")
+               else:
+                   raise OpenAIError(status_code=500, message=f"{str(e)}")

    def streaming(self,
                  logging_obj,
+                 timeout: float,
                  data: dict,
                  model: str,
                  api_key: Optional[str]=None,
                  api_base: Optional[str]=None
    ):
-       openai_client = OpenAI(api_key=api_key, base_url=api_base, http_client=litellm.client_session)
+       openai_client = OpenAI(api_key=api_key, base_url=api_base, http_client=litellm.client_session, timeout=timeout)
        response = openai_client.chat.completions.create(**data)
        streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai",logging_obj=logging_obj)
        for transformed_chunk in streamwrapper:
@@ -262,16 +271,27 @@ class OpenAIChatCompletion(BaseLLM):
    async def async_streaming(self,
                          logging_obj,
+                         timeout: float,
                          data: dict,
                          model: str,
                          api_key: Optional[str]=None,
                          api_base: Optional[str]=None):
-       openai_aclient = AsyncOpenAI(api_key=api_key, base_url=api_base, http_client=litellm.aclient_session)
-       response = await openai_aclient.chat.completions.create(**data)
-       streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai",logging_obj=logging_obj)
-       async for transformed_chunk in streamwrapper:
-           yield transformed_chunk
-
+       response = None
+       try:
+           openai_aclient = AsyncOpenAI(api_key=api_key, base_url=api_base, http_client=litellm.aclient_session, timeout=timeout)
+           response = await openai_aclient.chat.completions.create(**data)
+           streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai",logging_obj=logging_obj)
+           async for transformed_chunk in streamwrapper:
+               yield transformed_chunk
+       except Exception as e: # need to exception handle here. async exceptions don't get caught in sync functions.
+           if response is not None and hasattr(response, "text"):
+               raise OpenAIError(status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}")
+           else:
+               if type(e).__name__ == "ReadTimeout":
+                   raise OpenAIError(status_code=408, message=f"{type(e).__name__}")
+               else:
+                   raise OpenAIError(status_code=500, message=f"{str(e)}")
+
    def embedding(self,
                model: str,
                input: list,
diff --git a/litellm/main.py b/litellm/main.py
index 5bb08cc99..b6056fda8 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -10,7 +10,6 @@
 import os, openai, sys, json, inspect, uuid, datetime, threading
 from typing import Any
 from functools import partial
-
 import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
@@ -18,7 +17,6 @@ import litellm
 from litellm import ( # type: ignore
     client,
     exception_type,
-    timeout,
     get_optional_params,
     get_litellm_params,
     Logging,
@@ -176,28 +174,28 @@ async def acompletion(*args, **kwargs):
                init_response = completion(*args, **kwargs)
                if isinstance(init_response, dict) or isinstance(init_response, ModelResponse): ## CACHING SCENARIO
                    response = init_response
-               else:
+               elif asyncio.iscoroutine(init_response):
                    response = await init_response
        else:
            # Call the synchronous function using run_in_executor
            response = await loop.run_in_executor(None, func_with_context)
        if kwargs.get("stream", False): # return an async generator
-           # do not change this
-           # for stream = True, always return an async generator
-           # See OpenAI acreate https://github.com/openai/openai-python/blob/5d50e9e3b39540af782ca24e65c290343d86e1a9/openai/api_resources/abstract/engine_api_resource.py#L193
-           # return response
-           return(
-               line
-               async for line in response
-           )
+           return _async_streaming(response=response, model=model, custom_llm_provider=custom_llm_provider, args=args)
        else:
            return response
    except Exception as e:
-       ## Map to OpenAI Exception
        raise exception_type(
                model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args,
            )

+async def _async_streaming(response, model, custom_llm_provider, args):
+    try:
+        async for line in response:
+            yield line
+    except Exception as e:
+        raise exception_type(
+            model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args,
+        )

 def mock_completion(model: str, messages: List, stream: Optional[bool] = False, mock_response: str = "This is a mock request", **kwargs):
     """
@@ -245,6 +243,7 @@ def completion(
     messages: List = [],
     functions: List = [],
     function_call: str = "", # optional params
+    timeout: Union[float, int] = 600.0,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     n: Optional[int] = None,
@@ -261,7 +260,6 @@ def completion(
     tools: Optional[List] = None,
     tool_choice: Optional[str] = None,
     deployment_id = None,
-
     # set api_base, api_version, api_key
     base_url: Optional[str] = None,
     api_version: Optional[str] = None,
@@ -298,9 +296,8 @@ def completion(
         LITELLM Specific Params
         mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None).
-        force_timeout (int, optional): The maximum execution time in seconds for the completion request (default is 600).
         custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock"
-        num_retries (int, optional): The number of retries to attempt (default is 0).
+        max_retries (int, optional): The number of retries to attempt (default is 0).
     Returns:
         ModelResponse: A response object containing the generated completion and associated metadata.
@@ -314,7 +311,7 @@ def completion(
    api_base = kwargs.get('api_base', None)
    return_async = kwargs.get('return_async', False)
    mock_response = kwargs.get('mock_response', None)
-   force_timeout= kwargs.get('force_timeout', 600)
+   force_timeout= kwargs.get('force_timeout', 600) ## deprecated
    logger_fn = kwargs.get('logger_fn', None)
    verbose = kwargs.get('verbose', False)
    custom_llm_provider = kwargs.get('custom_llm_provider', None)
@@ -338,8 +335,10 @@ def completion(
    litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers", "model_list", "num_retries", "context_window_fallback_dict", "roles", "final_prompt_value", "bos_token", "eos_token", "request_timeout", "complete_response", "self", "max_retries"]
    default_params = openai_params + litellm_params
    non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider
+
    if mock_response:
        return mock_completion(model, messages, stream=stream, mock_response=mock_response)
+   timeout = float(timeout)
    try:
        if base_url:
            api_base = base_url
@@ -486,7 +485,8 @@ def completion(
                litellm_params=litellm_params,
                logger_fn=logger_fn,
                logging_obj=logging,
-               acompletion=acompletion
+               acompletion=acompletion,
+               timeout=timeout
            )

            ## LOGGING
@@ -552,7 +552,8 @@ def completion(
                logging_obj=logging,
                optional_params=optional_params,
                litellm_params=litellm_params,
-               logger_fn=logger_fn
+               logger_fn=logger_fn,
+               timeout=timeout
            )
        except Exception as e:
            ## LOGGING - log the original exception returned
@@ -1399,6 +1400,24 @@ def completion_with_retries(*args, **kwargs):
        retryer = tenacity.Retrying(wait=tenacity.wait_exponential(multiplier=1, max=10), stop=tenacity.stop_after_attempt(num_retries), reraise=True)
    return retryer(original_function, *args, **kwargs)

+async def acompletion_with_retries(*args, **kwargs):
+    """
+    Executes a litellm.completion() with 3 retries
+    """
+    try:
+        import tenacity
+    except Exception as e:
+        raise Exception(f"tenacity import failed please run `pip install tenacity`. Error{e}")
+
+    num_retries = kwargs.pop("num_retries", 3)
+    retry_strategy = kwargs.pop("retry_strategy", "constant_retry")
+    original_function = kwargs.pop("original_function", completion)
+    if retry_strategy == "constant_retry":
+        retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(num_retries), reraise=True)
+    elif retry_strategy == "exponential_backoff_retry":
+        retryer = tenacity.Retrying(wait=tenacity.wait_exponential(multiplier=1, max=10), stop=tenacity.stop_after_attempt(num_retries), reraise=True)
+    return await retryer(original_function, *args, **kwargs)
+

 def batch_completion(
@@ -1639,9 +1658,6 @@ async def aembedding(*args, **kwargs):
        return response

 @client
-@timeout( # type: ignore
-    60
-) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
 def embedding(
     model,
     input=[],
diff --git a/litellm/proxy/config.yaml b/litellm/proxy/config.yaml
index 2a71e5a6a..4bd0f42f1 100644
--- a/litellm/proxy/config.yaml
+++ b/litellm/proxy/config.yaml
@@ -25,5 +25,4 @@ model_list:

 litellm_settings:
   drop_params: True
-  success_callback: ["langfuse"] # https://docs.litellm.ai/docs/observability/langfuse_integration
-
+  success_callback: ["langfuse"] # https://docs.litellm.ai/docs/observability/langfuse_integration
\ No newline at end of file
diff --git a/litellm/router.py b/litellm/router.py
index 8ec55b313..1acc3ffd5 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -33,7 +33,7 @@ class Router:
                 redis_port: Optional[int] = None,
                 redis_password: Optional[str] = None,
                 cache_responses: bool = False,
-                num_retries: Optional[int] = None,
+                num_retries: int = 0,
                 timeout: float = 600,
                 default_litellm_params = {}, # default params for Router.chat.completion.create
                 routing_strategy: Literal["simple-shuffle", "least-busy"] = "simple-shuffle") -> None:
@@ -42,12 +42,13 @@ class Router:
        self.set_model_list(model_list)
        self.healthy_deployments: List = self.model_list

-       if num_retries:
-           self.num_retries = num_retries
+       self.num_retries = num_retries

        self.chat = litellm.Chat(params=default_litellm_params)

-       litellm.request_timeout = timeout
+       self.default_litellm_params = {
+           "timeout": timeout
+       }
        self.routing_strategy = routing_strategy
        ### HEALTH CHECK THREAD ###
        if self.routing_strategy == "least-busy":
@@ -222,6 +223,9 @@ class Router:
            # pick the one that is available (lowest TPM/RPM)
            deployment = self.get_available_deployment(model=model, messages=messages)
            data = deployment["litellm_params"]
+           for k, v in self.default_litellm_params.items():
+               if k not in data: # prioritize model-specific params > default router params
+                   data[k] = v
            return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
@@ -234,16 +238,20 @@ class Router:
        try:
            deployment = self.get_available_deployment(model=model, messages=messages)
            data = deployment["litellm_params"]
+           for k, v in self.default_litellm_params.items():
+               if k not in data: # prioritize model-specific params > default router params
+                   data[k] = v
            response = await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
-           if inspect.iscoroutinefunction(response): # async errors are often returned as coroutines
-               response = await response
            return response
        except Exception as e:
-           kwargs["model"] = model
-           kwargs["messages"] = messages
-           kwargs["original_exception"] = e
-           kwargs["original_function"] = self.acompletion
-           return await self.async_function_with_retries(**kwargs)
+           if self.num_retries > 0:
+               kwargs["model"] = model
+               kwargs["messages"] = messages
+               kwargs["original_exception"] = e
+               kwargs["original_function"] = self.acompletion
+               return await self.async_function_with_retries(**kwargs)
+           else:
+               raise e

    def text_completion(self,
                        model: str,
@@ -258,6 +266,9 @@ class Router:
        deployment = self.get_available_deployment(model=model, messages=messages)

        data = deployment["litellm_params"]
+       for k, v in self.default_litellm_params.items():
+           if k not in data: # prioritize model-specific params > default router params
+               data[k] = v
        # call via litellm.completion()
        return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs}) # type: ignore
@@ -270,6 +281,9 @@ class Router:
        deployment = self.get_available_deployment(model=model,
                                                   input=input)
        data = deployment["litellm_params"]
+       for k, v in self.default_litellm_params.items():
+           if k not in data: # prioritize model-specific params > default router params
+               data[k] = v
        # call via litellm.embedding()
        return litellm.embedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})
@@ -282,4 +296,7 @@ class Router:
        deployment = self.get_available_deployment(model=model,
                                                   input=input)
        data = deployment["litellm_params"]
+       for k, v in self.default_litellm_params.items():
+           if k not in data: # prioritize model-specific params > default router params
+               data[k] = v
        return await litellm.aembedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})
\ No newline at end of file
diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py
index efd017c41..e47086526 100644
--- a/litellm/tests/test_async_fn.py
+++ b/litellm/tests/test_async_fn.py
@@ -5,8 +5,6 @@ import sys, os
 import pytest
 import traceback
 import asyncio, logging
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
 sys.path.insert(
     0, os.path.abspath("../..")
@@ -20,7 +18,8 @@ def test_sync_response():
    user_message = "Hello, how are you?"
    messages = [{"content": user_message, "role": "user"}]
    try:
-       response = completion(model="gpt-3.5-turbo", messages=messages, api_key=os.environ["OPENAI_API_KEY"])
+       response = completion(model="gpt-3.5-turbo", messages=messages, timeout=5)
+       print(f"response: {response}")
    except Exception as e:
        pytest.fail(f"An exception occurred: {e}")
 # test_sync_response()
@@ -30,7 +29,7 @@ def test_sync_response_anyscale():
    user_message = "Hello, how are you?"
    messages = [{"content": user_message, "role": "user"}]
    try:
-       response = completion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages)
+       response = completion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages, timeout=5)
    except Exception as e:
        pytest.fail(f"An exception occurred: {e}")

@@ -43,10 +42,13 @@ def test_async_response_openai():
        user_message = "Hello, how are you?"
        messages = [{"content": user_message, "role": "user"}]
        try:
-           response = await acompletion(model="gpt-3.5-turbo", messages=messages)
+           response = await acompletion(model="gpt-3.5-turbo", messages=messages, timeout=5)
            print(f"response: {response}")
+       except litellm.Timeout as e:
+           pass
        except Exception as e:
            pytest.fail(f"An exception occurred: {e}")
+           print(e)

    asyncio.run(test_get_response())

@@ -56,17 +58,18 @@ def test_async_response_azure():
    import asyncio
    litellm.set_verbose = True
    async def test_get_response():
-       user_message = "Hello, how are you?"
+       user_message = "What do you know?"
        messages = [{"content": user_message, "role": "user"}]
        try:
-           response = await acompletion(model="azure/chatgpt-v-2", messages=messages)
+           response = await acompletion(model="azure/chatgpt-v-2", messages=messages, timeout=5)
            print(f"response: {response}")
+       except litellm.Timeout as e:
+           pass
        except Exception as e:
            pytest.fail(f"An exception occurred: {e}")

    asyncio.run(test_get_response())

-
 def test_async_anyscale_response():
    import asyncio
    litellm.set_verbose = True
@@ -74,9 +77,11 @@ def test_async_anyscale_response():
        user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages) + response = await acompletion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages, timeout=5) # response = await response print(f"response: {response}") + except litellm.Timeout as e: + pass except Exception as e: pytest.fail(f"An exception occurred: {e}") @@ -91,7 +96,7 @@ def test_get_response_streaming(): messages = [{"content": user_message, "role": "user"}] try: litellm.set_verbose = True - response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True) + response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True, timeout=5) print(type(response)) import inspect @@ -108,12 +113,12 @@ def test_get_response_streaming(): assert isinstance(output, str), "output needs to be of type str" assert len(output) > 0, "Length of output needs to be greater than 0." print(f'output: {output}') + except litellm.Timeout as e: + pass except Exception as e: pytest.fail(f"An exception occurred: {e}") - return response asyncio.run(test_async_call()) - # test_get_response_streaming() def test_get_response_non_openai_streaming(): @@ -123,7 +128,7 @@ def test_get_response_non_openai_streaming(): user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages, stream=True) + response = await acompletion(model="anyscale/mistralai/Mistral-7B-Instruct-v0.1", messages=messages, stream=True, timeout=5) print(type(response)) import inspect @@ -140,10 +145,11 @@ def test_get_response_non_openai_streaming(): assert output is not None, "output cannot be None." assert isinstance(output, str), "output needs to be of type str" assert len(output) > 0, "Length of output needs to be greater than 0." 
-
+       except litellm.Timeout as e:
+           pass
        except Exception as e:
            pytest.fail(f"An exception occurred: {e}")
        return response

    asyncio.run(test_async_call())
-# test_get_response_non_openai_streaming()
\ No newline at end of file
+test_get_response_non_openai_streaming()
\ No newline at end of file
diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py
index deae9ab39..c01f6680a 100644
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@@ -242,11 +242,11 @@ def test_acompletion_on_router():
        ]

        messages = [
-           {"role": "user", "content": "What is the weather like in Boston?"}
+           {"role": "user", "content": "What is the weather like in SF?"}
        ]

        async def get_response():
-           router = Router(model_list=model_list, redis_host=os.environ["REDIS_HOST"], redis_password=os.environ["REDIS_PASSWORD"], redis_port=os.environ["REDIS_PORT"], cache_responses=True, timeout=10)
+           router = Router(model_list=model_list, redis_host=os.environ["REDIS_HOST"], redis_password=os.environ["REDIS_PASSWORD"], redis_port=os.environ["REDIS_PORT"], cache_responses=True, timeout=0.1)
            response1 = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
            print(f"response1: {response1}")
            response2 = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
diff --git a/litellm/utils.py b/litellm/utils.py
index 808dc3323..c32e05c3e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -18,7 +18,7 @@ import tiktoken
 import uuid
 import aiohttp
 import logging
-import asyncio, httpx
+import asyncio, httpx, inspect
 import copy
 from tokenizers import Tokenizer
 from dataclasses import (
@@ -1047,7 +1047,6 @@ def exception_logging(
 # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
 def client(original_function):
     global liteDebuggerClient, get_all_keys
-    import inspect
     def function_setup(
         start_time, *args, **kwargs
     ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
@@ -1333,13 +1332,13 @@ def client(original_function):
                        kwargs["retry_strategy"] = "exponential_backoff_retry"
                    elif (isinstance(e, openai.APIError)): # generic api error
                        kwargs["retry_strategy"] = "constant_retry"
-                   return litellm.completion_with_retries(*args, **kwargs)
+                   return await litellm.acompletion_with_retries(*args, **kwargs)
                elif isinstance(e, litellm.exceptions.ContextWindowExceededError) and context_window_fallback_dict and model in context_window_fallback_dict:
                    if len(args) > 0:
                        args[0] = context_window_fallback_dict[model]
                    else:
                        kwargs["model"] = context_window_fallback_dict[model]
-                   return original_function(*args, **kwargs)
+                   return await original_function(*args, **kwargs)
            traceback_exception = traceback.format_exc()
            crash_reporting(*args, **kwargs, exception=traceback_exception)
            end_time = datetime.datetime.now()
@@ -3370,7 +3369,7 @@ def exception_type(
        else:
            exception_type = ""
-       if "Request Timeout Error" in error_str:
+       if "Request Timeout Error" in error_str or "Request timed out" in error_str:
            exception_mapping_worked = True
            raise Timeout(
                message=f"APITimeoutError - Request timed out",
@@ -3411,7 +3410,6 @@ def exception_type(
                        message=f"OpenAIException - {original_exception.message}",
                        model=model,
                        llm_provider="openai",
-                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
@@ -4229,7 +4227,7 @@ def exception_type(
                    llm_provider=custom_llm_provider,
                    response=original_exception.response
                )
-           else: # ensure generic errors always return APIConnectionError
+           else: # ensure generic errors always return APIConnectionError=
                exception_mapping_worked = True
                if hasattr(original_exception, "request"):
                    raise APIConnectionError(