diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc index 3907a3f34..b395309b6 100644 Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc index bf977f9fd..5f546c9e3 100644 Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index e63344492..e7c76d0ef 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos } status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - time.sleep(0.0001) + time.sleep(0.0001) # prevent being rate limited by replicate response = requests.get(prediction_url, headers=headers) if response.status_code == 200: response_data = response.json() + status = response_data['status'] + print(f"response data: {response_data}") if "output" in response_data: output_string = "".join(response_data['output']) new_output = output_string[len(previous_output):] - yield new_output + yield {"output": new_output, "status": status} previous_output = output_string status = response_data['status'] diff --git a/litellm/main.py b/litellm/main.py index 1b2789f75..e34355317 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -485,11 +485,11 @@ def completion( # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") replicate_key = None replicate_key = ( - get_secret("REPLICATE_API_KEY") - or get_secret("REPLICATE_API_TOKEN") - or api_key + api_key or litellm.replicate_key - or litellm.api_key + or litellm.api_key + or get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") ) model_response = replicate.completion( @@ -575,7 +575,7 @@ def completion( if "stream" in optional_params and optional_params["stream"] == True: # don't try to access stream object, - response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging) + response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging) return response response = model_response elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": @@ -769,7 +769,7 @@ def completion( if stream: model_response = chat.send_message_streaming(prompt, **optional_params) response = CustomStreamWrapper( - model_response, model, custom_llm_provider="vertexai", logging_obj=logging + model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging ) return response diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index da5994ccd..980aa14f2 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -643,24 +643,6 @@ def test_completion_sagemaker(): # test_completion_sagemaker() -def test_completion_sagemaker_stream(): - litellm.set_verbose = False - try: - response = completion( - model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", - messages=messages, - temperature=0.2, - max_tokens=80, - stream=True, - ) - # Add any assertions here to check the response - for chunk in response: - print(chunk) - except Exception as e: - pytest.fail(f"Error occurred: {e}") - -# test_completion_sagemaker_stream() - def test_completion_bedrock_titan(): try: response 
= completion( diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 10f772c25..495630300 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import litellm -from litellm import completion, acompletion +from litellm import completion, acompletion, AuthenticationError, InvalidRequestError litellm.logging = False litellm.set_verbose = False @@ -187,6 +187,7 @@ def streaming_format_tests(idx, chunk): finished = True if "content" in chunk["choices"][0]["delta"]: extracted_chunk = chunk["choices"][0]["delta"]["content"] + print(f"extracted chunk: {extracted_chunk}") return extracted_chunk, finished def test_completion_cohere_stream(): @@ -199,21 +200,120 @@ def test_completion_cohere_stream(): }, ] response = completion( - model="command-nightly", messages=messages, stream=True, max_tokens=50 + model="command-nightly", messages=messages, stream=True, max_tokens=50, ) complete_response = "" # Add any assertions here to check the response + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished if finished: break complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not in final chunk") if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"Error occurred: {e}") +# test_completion_cohere_stream() + +def test_completion_cohere_stream_bad_key(): + try: + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="command-nightly", messages=messages, stream=True, max_tokens=50, api_key=api_key + ) + complete_response = "" + # Add any assertions here to check the response + has_finish_reason = False + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not in final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except AuthenticationError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_cohere_stream_bad_key() + +# def test_completion_nlp_cloud(): +# try: +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion(model="dolphin", messages=messages, stream=True) +# complete_response = "" +# # Add any assertions here to check the response +# has_finish_reason = False +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# complete_response += chunk +# if finished: +# break +# if has_finish_reason is False: +# raise Exception("Finish reason not in final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# 
test_completion_nlp_cloud() + +# def test_completion_nlp_cloud_bad_key(): +# try: +# api_key = "bad-key" +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion(model="dolphin", messages=messages, stream=True, api_key=api_key) +# complete_response = "" +# # Add any assertions here to check the response +# has_finish_reason = False +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# complete_response += chunk +# if finished: +# break +# if has_finish_reason is False: +# raise Exception("Finish reason not in final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_nlp_cloud_bad_key() + # def test_completion_hf_stream(): # try: # messages = [ @@ -235,10 +335,41 @@ def test_completion_cohere_stream(): # if complete_response.strip() == "": # raise Exception("Empty response received") # print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass # except Exception as e: # pytest.fail(f"Error occurred: {e}") -# test_completion_hf_stream() +# # test_completion_hf_stream() + +# def test_completion_hf_stream_bad_key(): +# try: +# api_key = "bad-key" +# messages = [ +# { +# "content": "Hello! How are you today?", +# "role": "user" +# }, +# ] +# response = completion( +# model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base="https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud", stream=True, max_tokens=1000, api_key=api_key +# ) +# complete_response = "" +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# if finished: +# break +# complete_response += chunk +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_hf_stream_bad_key() def test_completion_claude_stream(): try: @@ -266,19 +397,22 @@ def test_completion_claude_stream(): pytest.fail(f"Error occurred: {e}") # test_completion_claude_stream() -def test_completion_bedrock_ai21_stream(): + +def test_completion_claude_stream_bad_key(): try: - litellm.set_verbose = False + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] response = completion( - model="bedrock/amazon.titan-tg1-large", - messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], - temperature=1, - max_tokens=4096, - stream=True, + model="claude-instant-1", messages=messages, stream=True, max_tokens=50, api_key=api_key ) - complete_response = "" - # Add any assertions here to check the response - print(response) + complete_response = "" + # Add any assertions here to check the response for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) if finished: @@ -286,11 +420,263 @@ def test_completion_bedrock_ai21_stream(): 
complete_response += chunk if complete_response.strip() == "": raise Exception("Empty response received") + print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"Error occurred: {e}") -# test_completion_cohere_stream() +# test_completion_claude_stream_bad_key() + +def test_completion_replicate_stream(): + try: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50 + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_completion_replicate_stream() + +# def test_completion_vertexai_stream(): +# try: +# import os +# os.environ["VERTEXAI_PROJECT"] = "pathrise-convert-1606954137718" +# os.environ["VERTEXAI_LOCATION"] = "us-central1" +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion( +# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50 +# ) +# complete_response = "" +# has_finish_reason = False +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# if finished: +# break +# complete_response += chunk +# if has_finish_reason is False: +# raise Exception("finish reason not set for last chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_vertexai_stream() + + +# def test_completion_vertexai_stream_bad_key(): +# try: +# import os +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion( +# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50 +# ) +# complete_response = "" +# has_finish_reason = False +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# if finished: +# break +# complete_response += chunk +# if has_finish_reason is False: +# raise Exception("finish reason not set for last chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_vertexai_stream_bad_key() + 
+def test_completion_replicate_stream(): + try: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50 + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +def test_completion_replicate_stream_bad_key(): + try: + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50, api_key=api_key + ) + complete_response = "" + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + if finished: + break + complete_response += chunk + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_replicate_stream_bad_key() + +def test_completion_bedrock_ai21_stream(): + try: + response = completion( + model="bedrock/amazon.titan-tg1-large", + messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], + temperature=1, + max_tokens=4096, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_bedrock_ai21_stream() + +def test_completion_bedrock_ai21_stream_bad_key(): + try: + response = completion( + model="bedrock/amazon.titan-tg1-large", + messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], + temperature=1, + max_tokens=4096, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += 
chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_bedrock_ai21_stream_bad_key() + +def test_completion_sagemaker_stream(): + try: + response = completion( + model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", + messages=messages, + temperature=0.2, + max_tokens=80, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +test_completion_sagemaker_stream() # test on openai completion call def test_openai_text_completion_call(): @@ -314,7 +700,33 @@ def test_openai_text_completion_call(): def ai21_completion_call(): try: response = completion( - model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn + model="j2-ultra", messages=messages, stream=True + ) + print(f"response: {response}") + has_finished = False + complete_response = "" + start_time = time.time() + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finished = finished + complete_response += chunk + if finished: + break + if has_finished is False: + raise Exception("finished reason missing from final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except: + pytest.fail(f"error occurred: {traceback.format_exc()}") + +# ai21_completion_call() + +def ai21_completion_call_bad_key(): + try: + api_key = "bad-key" + response = completion( + model="j2-ultra", messages=messages, stream=True, api_key=api_key ) print(f"response: {response}") complete_response = "" @@ -327,10 +739,64 @@ def ai21_completion_call(): if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass except: pytest.fail(f"error occurred: {traceback.format_exc()}") -# ai21_completion_call() +# ai21_completion_call_bad_key() + +def test_completion_aleph_alpha(): + try: + response = completion( + model="luminous-base", messages=messages, stream=True + ) + # Add any assertions here to check the response + has_finished = False + complete_response = "" + start_time = time.time() + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finished = finished + complete_response += chunk + if finished: + break + if has_finished is False: + raise Exception("finished reason missing from final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_aleph_alpha() + +# def test_completion_aleph_alpha_bad_key(): +# try: +# api_key = "bad-key" +# response = completion( +# model="luminous-base", messages=messages, stream=True, 
api_key=api_key +# ) +# # Add any assertions here to check the response +# has_finished = False +# complete_response = "" +# start_time = time.time() +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finished = finished +# complete_response += chunk +# if finished: +# break +# if has_finished is False: +# raise Exception("finished reason missing from final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_aleph_alpha_bad_key() + # test on openai completion call def test_openai_chat_completion_call(): try: @@ -366,11 +832,15 @@ def test_together_ai_completion_call_starcoder(): ) complete_response = "" print(f"returned response object: {response}") + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished if finished: break complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not set for last chunk") if complete_response == "": raise Exception("Empty response received") print(f"complete response: {complete_response}") @@ -378,6 +848,38 @@ def test_together_ai_completion_call_starcoder(): print(f"error occurred: {traceback.format_exc()}") pass +# test_together_ai_completion_call_starcoder() + +def test_together_ai_completion_call_starcoder_bad_key(): + try: + api_key = "bad-key" + start_time = time.time() + response = completion( + model="together_ai/bigcode/starcoder", + messages=messages, + stream=True, + api_key=api_key + ) + complete_response = "" + has_finish_reason = False + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not set for last chunk") + if complete_response == "": + raise Exception("Empty response received") + print(f"complete response: {complete_response}") + except InvalidRequestError as e: + pass + except: + print(f"error occurred: {traceback.format_exc()}") + pass + +# test_together_ai_completion_call_starcoder_bad_key() #### Test Function calling + streaming #### def test_completion_openai_with_functions(): diff --git a/litellm/utils.py b/litellm/utils.py index 0d2ce8c95..046c82cf1 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2,6 +2,7 @@ import sys import dotenv, json, traceback, threading import subprocess, os import litellm, openai +import itertools import random, uuid, requests import datetime, time import tiktoken @@ -1915,7 +1916,6 @@ def exception_type( ): global user_logger_fn, liteDebuggerClient exception_mapping_worked = False - if litellm.set_verbose == True: litellm.error_logs['EXCEPTION'] = original_exception litellm.error_logs['KWARGS'] = completion_kwargs @@ -1970,7 +1970,7 @@ def exception_type( exception_type = type(original_exception).__name__ else: exception_type = "" - if "claude" in model: # one of the anthropics + if custom_llm_provider == "anthropic": # one of the anthropics if hasattr(original_exception, "message"): if "prompt is too long" in original_exception.message: exception_mapping_worked = True @@ -1979,6 +1979,13 @@ def exception_type( model=model, llm_provider="anthropic" ) + if "Invalid API Key" in original_exception.message: + exception_mapping_worked = True + raise 
AuthenticationError( + message=original_exception.message, + model=model, + llm_provider="anthropic" + ) if hasattr(original_exception, "status_code"): print_verbose(f"status_code: {original_exception.status_code}") if original_exception.status_code == 401: @@ -2031,7 +2038,7 @@ def exception_type( llm_provider="anthropic", model=model ) - elif "replicate" in model: + elif custom_llm_provider == "replicate": if "Incorrect authentication token" in error_str: exception_mapping_worked = True raise AuthenticationError( @@ -2068,7 +2075,7 @@ def exception_type( llm_provider="replicate", model=model ) - elif original_exception.status_code == 400: + elif original_exception.status_code == 400 or original_exception.status_code == 422: exception_mapping_worked = True raise InvalidRequestError( message=f"ReplicateException - {original_exception.message}", @@ -2110,7 +2117,31 @@ def exception_type( llm_provider="replicate", model=model ) - elif model in litellm.cohere_models or custom_llm_provider == "cohere": # Cohere + elif custom_llm_provider == "bedrock": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"BedrockException - {error_str}", + model=model, + llm_provider="bedrock" + ) + elif custom_llm_provider == "sagemaker": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker" + ) + elif custom_llm_provider == "vertex_ai": + if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai" + ) + elif custom_llm_provider == "cohere": # Cohere if ( "invalid api token" in error_str or "No API key provided." in error_str @@ -2184,6 +2215,13 @@ def exception_type( model=model, llm_provider="huggingface" ) + elif "A valid user token is required" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=error_str, + llm_provider="huggingface", + model=model + ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: exception_mapping_worked = True @@ -2221,6 +2259,8 @@ def exception_type( llm_provider="huggingface", model=model ) + exception_mapping_worked = True + raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider) elif custom_llm_provider == "ai21": if hasattr(original_exception, "message"): if "Prompt has too many tokens" in original_exception.message: @@ -2230,6 +2270,13 @@ def exception_type( model=model, llm_provider="ai21" ) + if "Bad or missing API token." 
in original_exception.message: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21" + ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: exception_mapping_worked = True @@ -2266,7 +2313,7 @@ def exception_type( llm_provider="ai21", model=model ) - elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud": + elif custom_llm_provider == "nlp_cloud": if "detail" in error_str: if "Input text length should not exceed" in error_str: exception_mapping_worked = True @@ -2342,6 +2389,7 @@ def exception_type( model=model ) elif custom_llm_provider == "together_ai": + import json error_response = json.loads(error_str) if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]: exception_mapping_worked = True @@ -2364,6 +2412,13 @@ def exception_type( model=model, llm_provider="together_ai" ) + elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai" + ) elif "error_type" in error_response and error_response["error_type"] == "validation": exception_mapping_worked = True raise InvalidRequestError( @@ -2393,7 +2448,7 @@ def exception_type( llm_provider="together_ai", model=model ) - elif model in litellm.aleph_alpha_models: + elif custom_llm_provider == "aleph_alpha": if "This is longer than the model's maximum context length" in error_str: exception_mapping_worked = True raise ContextWindowExceededError( @@ -2401,6 +2456,13 @@ def exception_type( llm_provider="aleph_alpha", model=model ) + elif "InvalidToken" in error_str or "No token provided" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model + ) elif hasattr(original_exception, "status_code"): print(f"status code: {original_exception.status_code}") if original_exception.status_code == 401: @@ -2445,7 +2507,8 @@ def exception_type( elif custom_llm_provider == "ollama": if "no attribute 'async_get_ollama_response_stream" in error_str: raise ImportError("Import error - trying to use async for ollama. import async_generator failed. 
Try 'pip install async_generator'") - raise original_exception + exception_mapping_worked = True + raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model) except Exception as e: # LOGGING exception_logging( @@ -2563,6 +2626,7 @@ class CustomStreamWrapper: self.logging_obj = logging_obj self.completion_stream = completion_stream self.sent_first_chunk = False + self.sent_last_chunk = False if self.logging_obj: # Log the type of the received item self.logging_obj.post_call(str(type(completion_stream))) @@ -2579,41 +2643,71 @@ class CustomStreamWrapper: def handle_anthropic_chunk(self, chunk): str_line = chunk.decode("utf-8") # Convert bytes to string + print(f"str_line: {str_line}") + text = "" + is_finished = False + finish_reason = None if str_line.startswith("data:"): data_json = json.loads(str_line[5:]) - return data_json.get("completion", "") - return "" + text = data_json.get("completion", "") + if data_json.get("stop_reason", None): + is_finished = True + finish_reason = data_json["stop_reason"] + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "error" in str_line: + raise ValueError(f"Unable to parse response. Original response: {str_line}") + else: + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} def handle_together_ai_chunk(self, chunk): chunk = chunk.decode("utf-8") - text_index = chunk.find('"text":"') # this checks if text: exists - text_start = text_index + len('"text":"') - text_end = chunk.find('"}', text_start) - if text_index != -1 and text_end != -1: - extracted_text = chunk[text_start:text_end] - return extracted_text + text = "" + is_finished = False + finish_reason = None + if "text" in chunk: + text_index = chunk.find('"text":"') # this checks if text: exists + text_start = text_index + len('"text":"') + text_end = chunk.find('"}', text_start) + if text_index != -1 and text_end != -1: + extracted_text = chunk[text_start:text_end] + text = extracted_text + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "[DONE]" in chunk: + return {"text": text, "is_finished": True, "finish_reason": "stop"} + elif "error" in chunk: + raise ValueError(chunk) else: - return "" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} def handle_huggingface_chunk(self, chunk): chunk = chunk.decode("utf-8") + text = "" + is_finished = False + finish_reason = "" if chunk.startswith("data:"): data_json = json.loads(chunk[5:]) + print(f"data json: {data_json}") if "token" in data_json and "text" in data_json["token"]: text = data_json["token"]["text"] if "meta-llama/Llama-2" in self.model: #clean eos tokens like from the returned output text if any(token in text for token in llama_2_special_tokens): text = text.replace("", "").replace("", "") - return text - else: - return "" - return "" + if data_json.get("details", False) and data_json["details"].get("finish_reason", False): + is_finished = True + finish_reason = data_json["details"]["finish_reason"] + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "error" in chunk: + raise ValueError(chunk) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} - def handle_ai21_chunk(self, chunk): + def handle_ai21_chunk(self, chunk): # fake streaming chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["completions"][0]["data"]["text"] + text = 
data_json["completions"][0]["data"]["text"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2621,8 +2715,10 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - print(f"data json: {data_json}") - return data_json["generated_text"] + text = data_json["generated_text"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2630,7 +2726,10 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["completions"][0]["completion"] + text = data_json["completions"][0]["completion"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2638,7 +2737,35 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["text"] + text = "" + is_finished = False + finish_reason = "" + if "text" in data_json: + text = data_json["text"] + elif "is_finished" in data_json: + is_finished = data_json["is_finished"] + finish_reason = data_json["finish_reason"] + else: + raise Exception(data_json) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + except: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_replicate_chunk(self, chunk): + print(f"chunk: {chunk}") + try: + text = "" + is_finished = False + finish_reason = "" + if "output" in chunk: + text = chunk['output'] + if "status" in chunk: + if chunk["status"] == "succeeded": + is_finished = True + finish_reason = "stop" + elif chunk.get("error", None): + raise Exception(chunk["error"]) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. 
Original response: {chunk}") @@ -2683,13 +2810,21 @@ class CustomStreamWrapper: traceback.print_exc() return "" - def handle_bedrock_stream(self): - if self.completion_stream: - event = next(self.completion_stream) - chunk = event.get('chunk') - if chunk: - chunk_data = json.loads(chunk.get('bytes').decode()) - return chunk_data['outputText'] + def handle_bedrock_stream(self, chunk): + chunk = chunk.get('chunk') + if chunk: + chunk_data = json.loads(chunk.get('bytes').decode()) + text = "" + is_finished = False + finish_reason = "" + if "outputText" in chunk_data: + text = chunk_data['outputText'] + if chunk_data.get("completionReason", None): + is_finished = True + finish_reason = chunk_data["completionReason"] + elif chunk.get("error", None): + raise Exception(chunk["error"]) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} return "" ## needs to handle the empty string case (even starting chunk can be an empty string) @@ -2701,49 +2836,94 @@ class CustomStreamWrapper: completion_obj = {"content": ""} if self.custom_llm_provider and self.custom_llm_provider == "anthropic": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_anthropic_chunk(chunk) + response_obj = self.handle_anthropic_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.model == "replicate" or self.custom_llm_provider == "replicate": chunk = next(self.completion_stream) - completion_obj["content"] = chunk + response_obj = self.handle_replicate_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif ( self.custom_llm_provider and self.custom_llm_provider == "together_ai"): chunk = next(self.completion_stream) - text_data = self.handle_together_ai_chunk(chunk) - if text_data == "": - return self.__next__() - completion_obj["content"] = text_data + response_obj = self.handle_together_ai_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_huggingface_chunk(chunk) + response_obj = self.handle_huggingface_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming chunk = next(self.completion_stream) completion_obj["content"] = self.handle_baseten_chunk(chunk) elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_ai21_chunk(chunk) + response_obj = self.handle_ai21_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "vllm": chunk = next(self.completion_stream) completion_obj["content"] = chunk[0].outputs[0].text - elif self.custom_llm_provider and self.custom_llm_provider == "aleph-alpha": #aleph alpha doesn't provide streaming + 
elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk) + response_obj = self.handle_aleph_alpha_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai": chunk = next(self.completion_stream) completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk) elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud": - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk) - elif self.model in (litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models): - chunk = next(self.completion_stream) - completion_obj["content"] = str(chunk) + try: + chunk = next(self.completion_stream) + response_obj = self.handle_nlp_cloud_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] + except Exception as e: + if self.sent_last_chunk: + raise e + else: + if self.sent_first_chunk is False: + raise Exception("An unknown error occurred with the stream") + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True + elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai": + try: + chunk = next(self.completion_stream) + completion_obj["content"] = str(chunk) + except StopIteration as e: + if self.sent_last_chunk: + raise e + else: + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True elif self.custom_llm_provider == "cohere": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_cohere_chunk(chunk) + response_obj = self.handle_cohere_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "bedrock": - completion_obj["content"] = self.handle_bedrock_stream() + chunk = next(self.completion_stream) + response_obj = self.handle_bedrock_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "sagemaker": if len(self.completion_stream)==0: - raise StopIteration + if self.sent_last_chunk: + raise StopIteration + else: + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True chunk_size = 30 new_chunk = self.completion_stream[:chunk_size] completion_obj["content"] = new_chunk @@ -2765,11 +2945,13 @@ class CustomStreamWrapper: self.sent_first_chunk = True model_response.choices[0].delta = Delta(**completion_obj) return model_response + elif model_response.choices[0].finish_reason: + return model_response except StopIteration: raise StopIteration - except Exception as e: - model_response.choices[0].finish_reason = "stop" - return model_response + except Exception as e: + e.message = str(e) + return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e) async def __anext__(self): try: @@ -2796,7 +2978,6 @@ def read_config_args(config_path) -> dict: # 
read keys/ values from config file and return them return config except Exception as e: - print("An error occurred while reading config:", str(e)) raise e ########## experimental completion variants ############################ @@ -2899,7 +3080,6 @@ def get_model_split_test(models, completion_call_id): try: # make the api call last_fetched_at = time.time() - print(f"last_fetched_at: {last_fetched_at}") response = requests.post( #http://api.litellm.ai url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict diff --git a/pyproject.toml b/pyproject.toml index bf0f0097c..80b3eb99d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.1.738" +version = "0.1.739" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License"
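The through-line of the utils.py changes is that every chunk handler in CustomStreamWrapper now returns a {"text", "is_finished", "finish_reason"} dict instead of a bare string, which is what lets the wrapper stamp choices[0].finish_reason onto the final chunk. A minimal standalone sketch of that contract, modeled on the updated handle_anthropic_chunk (the SSE payload shape comes from the diff; the function name and the example event below are illustrative only):

import json

def parse_anthropic_sse_line(raw: bytes) -> dict:
    # sketch of the new handler contract: always hand back text plus finish state
    str_line = raw.decode("utf-8")
    text, is_finished, finish_reason = "", False, None
    if str_line.startswith("data:"):
        data_json = json.loads(str_line[5:])
        text = data_json.get("completion", "")
        if data_json.get("stop_reason"):
            is_finished = True
            finish_reason = data_json["stop_reason"]
    elif "error" in str_line:
        raise ValueError(f"Unable to parse response. Original response: {str_line}")
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

# the terminal SSE event carries a stop_reason, so the wrapper can close the stream cleanly
final_event = b'data: {"completion": "", "stop_reason": "stop_sequence"}'
assert parse_anthropic_sse_line(final_event)["is_finished"] is True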
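On the consumer side, the rewritten streaming tests all assert the same two things: the concatenated deltas are non-empty and the last chunk carries a finish_reason. A hedged sketch of that pattern as a plain function; the model and prompt are placeholders, and any provider touched in this diff could be substituted:

import litellm

def consume_stream(model: str = "command-nightly") -> str:
    messages = [{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
    response = litellm.completion(model=model, messages=messages, stream=True, max_tokens=50)

    complete_response = ""
    has_finish_reason = False
    for chunk in response:
        choice = chunk["choices"][0]
        if choice["finish_reason"]:          # only the final chunk should set this
            has_finish_reason = True
            break
        if "content" in choice["delta"]:
            complete_response += choice["delta"]["content"]

    if not has_finish_reason:
        raise Exception("Finish reason not in final chunk")
    if complete_response.strip() == "":
        raise Exception("Empty response received")
    return complete_response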
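The Replicate change is the same idea applied at the source: the polling loop now yields each delta together with the prediction status, so handle_replicate_chunk can translate status == "succeeded" into finish_reason "stop". A sketch of that scheme, following the logic in handle_prediction_response_streaming (prediction_url and api_token are placeholders, and the Authorization header format is an assumption of this sketch):

import time
import requests

def stream_replicate_prediction(prediction_url: str, api_token: str):
    headers = {"Authorization": f"Token {api_token}"}  # assumed header format for this sketch
    previous_output = ""
    status = ""
    while status not in ["succeeded", "failed", "canceled"]:
        time.sleep(0.0001)  # brief pause between polls to avoid being rate limited by Replicate
        response = requests.get(prediction_url, headers=headers)
        if response.status_code != 200:
            continue
        response_data = response.json()
        status = response_data["status"]
        if "output" in response_data:
            output_string = "".join(response_data["output"])
            new_output = output_string[len(previous_output):]
            previous_output = output_string
            yield {"output": new_output, "status": status}  # status rides along with every delta

def replicate_chunk_to_piece(chunk: dict) -> dict:
    # mirrors handle_replicate_chunk: a "succeeded" status becomes finish_reason "stop"
    if chunk.get("error"):
        raise Exception(chunk["error"])
    is_finished = chunk.get("status") == "succeeded"
    return {"text": chunk.get("output", ""),
            "is_finished": is_finished,
            "finish_reason": "stop" if is_finished else ""}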
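The new *_bad_key tests and the exception_type changes go together: failures are now routed by custom_llm_provider rather than by model-name matching, and re-raised as litellm's mapped types (AuthenticationError for rejected keys, InvalidRequestError for 400/422-style responses, APIError as the fallback). A small sketch of the test idiom; the model name is only an example:

from litellm import completion, AuthenticationError, InvalidRequestError

def expect_mapped_key_error(model: str = "command-nightly") -> None:
    messages = [{"role": "user", "content": "hello"}]
    try:
        response = completion(model=model, messages=messages, stream=True,
                              max_tokens=50, api_key="bad-key")
        for _ in response:  # some providers only surface the failure once iteration begins
            pass
    except (AuthenticationError, InvalidRequestError):
        return  # the behavior the bad-key tests assert
    raise AssertionError("expected a mapped authentication/invalid-request error")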
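Finally, main.py flips the Replicate key lookup so an explicitly passed api_key wins over environment secrets. A sketch of that precedence as a standalone helper; resolve_replicate_key is a hypothetical name and os.environ stands in for litellm's get_secret:

import os

def resolve_replicate_key(api_key=None, module_replicate_key=None, module_api_key=None):
    # explicit argument first, module-level settings next, environment secrets last
    return (
        api_key
        or module_replicate_key
        or module_api_key
        or os.environ.get("REPLICATE_API_KEY")
        or os.environ.get("REPLICATE_API_TOKEN")
    )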