fix exception mapping for streaming

Krrish Dholakia 2023-09-23 15:04:34 -07:00
parent f984e5f380
commit 889679a0dd
8 changed files with 766 additions and 100 deletions
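Across the hunks below, the streaming chunk handlers in CustomStreamWrapper converge on one return shape: a dict carrying the text delta, whether the stream is finished, and the finish reason. A minimal sketch of that contract, assuming only the field names visible in the diff (the handler name and input here are hypothetical, not one of the real handlers):

# hypothetical handler; the real ones (handle_anthropic_chunk, handle_cohere_chunk, ...) parse provider-specific payloads
def handle_example_chunk(data_json):
    text = data_json.get("completion", "")              # provider-specific text field
    finish_reason = data_json.get("stop_reason", None)  # provider-specific stop marker
    is_finished = finish_reason is not None
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}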

View file

@@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos
}
status = ""
while True and (status not in ["succeeded", "failed", "canceled"]):
time.sleep(0.0001)
time.sleep(0.0001) # prevent being rate limited by replicate
response = requests.get(prediction_url, headers=headers)
if response.status_code == 200:
response_data = response.json()
status = response_data['status']
print(f"response data: {response_data}")
if "output" in response_data:
output_string = "".join(response_data['output'])
new_output = output_string[len(previous_output):]
yield new_output
yield {"output": new_output, "status": status}
previous_output = output_string
status = response_data['status']
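A minimal sketch of how a caller might consume this generator after the change; the function name and dict keys come from the hunk above, while the URL, token, and verbosity callback are placeholders:

prediction_url = "https://api.replicate.com/v1/predictions/<id>"   # placeholder
api_token = "r8_..."                                               # placeholder
full_text = ""
for chunk in handle_prediction_response_streaming(prediction_url, api_token, print_verbose=print):
    full_text += chunk["output"]                          # incremental text delta
    if chunk["status"] in ("succeeded", "failed", "canceled"):
        break                                             # terminal status, stream is done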

View file

@@ -485,11 +485,11 @@ def completion(
# Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
replicate_key = None
replicate_key = (
get_secret("REPLICATE_API_KEY")
or get_secret("REPLICATE_API_TOKEN")
or api_key
api_key
or litellm.replicate_key
or litellm.api_key
or litellm.api_key
or get_secret("REPLICATE_API_KEY")
or get_secret("REPLICATE_API_TOKEN")
)
model_response = replicate.completion(
@@ -575,7 +575,7 @@ def completion(
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object,
response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging)
response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging)
return response
response = model_response
elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
@@ -769,7 +769,7 @@ def completion(
if stream:
model_response = chat.send_message_streaming(prompt, **optional_params)
response = CustomStreamWrapper(
model_response, model, custom_llm_provider="vertexai", logging_obj=logging
model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
)
return response
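The provider tags passed to CustomStreamWrapper above now match the keys that the stream wrapper and exception mapping branch on ("aleph_alpha", "vertex_ai"). A user-facing sketch of the streaming path this enables, mirroring the tests later in this commit (project and location credentials are assumed to already be configured in the environment):

import litellm
response = litellm.completion(
    model="vertex_ai/chat-bison",   # provider prefix matches custom_llm_provider="vertex_ai"
    messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}],
    stream=True,
)
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"])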

View file

@@ -643,24 +643,6 @@ def test_completion_sagemaker():
# test_completion_sagemaker()
def test_completion_sagemaker_stream():
litellm.set_verbose = False
try:
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=messages,
temperature=0.2,
max_tokens=80,
stream=True,
)
# Add any assertions here to check the response
for chunk in response:
print(chunk)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream()
def test_completion_bedrock_titan():
try:
response = completion(

View file

@@ -9,7 +9,7 @@ sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import completion, acompletion
from litellm import completion, acompletion, AuthenticationError, InvalidRequestError
litellm.logging = False
litellm.set_verbose = False
@@ -187,6 +187,7 @@ def streaming_format_tests(idx, chunk):
finished = True
if "content" in chunk["choices"][0]["delta"]:
extracted_chunk = chunk["choices"][0]["delta"]["content"]
print(f"extracted chunk: {extracted_chunk}")
return extracted_chunk, finished
def test_completion_cohere_stream():
@@ -199,21 +200,120 @@ def test_completion_cohere_stream():
},
]
response = completion(
model="command-nightly", messages=messages, stream=True, max_tokens=50
model="command-nightly", messages=messages, stream=True, max_tokens=50,
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not in final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream()
def test_completion_cohere_stream_bad_key():
try:
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="command-nightly", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not in final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except AuthenticationError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream_bad_key()
# def test_completion_nlp_cloud():
# try:
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(model="dolphin", messages=messages, stream=True)
# complete_response = ""
# # Add any assertions here to check the response
# has_finish_reason = False
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# complete_response += chunk
# if finished:
# break
# if has_finish_reason is False:
# raise Exception("Finish reason not in final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_nlp_cloud()
# def test_completion_nlp_cloud_bad_key():
# try:
# api_key = "bad-key"
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(model="dolphin", messages=messages, stream=True, api_key=api_key)
# complete_response = ""
# # Add any assertions here to check the response
# has_finish_reason = False
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# complete_response += chunk
# if finished:
# break
# if has_finish_reason is False:
# raise Exception("Finish reason not in final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_nlp_cloud_bad_key()
# def test_completion_hf_stream():
# try:
# messages = [
@@ -235,10 +335,41 @@ def test_completion_cohere_stream():
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_hf_stream()
# # test_completion_hf_stream()
# def test_completion_hf_stream_bad_key():
# try:
# api_key = "bad-key"
# messages = [
# {
# "content": "Hello! How are you today?",
# "role": "user"
# },
# ]
# response = completion(
# model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base="https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud", stream=True, max_tokens=1000, api_key=api_key
# )
# complete_response = ""
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# if finished:
# break
# complete_response += chunk
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_hf_stream_bad_key()
def test_completion_claude_stream():
try:
@@ -266,19 +397,22 @@ def test_completion_claude_stream():
pytest.fail(f"Error occurred: {e}")
# test_completion_claude_stream()
def test_completion_bedrock_ai21_stream():
def test_completion_claude_stream_bad_key():
try:
litellm.set_verbose = False
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
model="claude-instant-1", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
print(response)
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
@@ -286,11 +420,263 @@ def test_completion_bedrock_ai21_stream():
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream()
# test_completion_claude_stream_bad_key()
def test_completion_replicate_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_replicate_stream()
# def test_completion_vertexai_stream():
# try:
# import os
# os.environ["VERTEXAI_PROJECT"] = "pathrise-convert-1606954137718"
# os.environ["VERTEXAI_LOCATION"] = "us-central1"
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(
# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
# )
# complete_response = ""
# has_finish_reason = False
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# if finished:
# break
# complete_response += chunk
# if has_finish_reason is False:
# raise Exception("finish reason not set for last chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_vertexai_stream()
# def test_completion_vertexai_stream_bad_key():
# try:
# import os
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(
# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
# )
# complete_response = ""
# has_finish_reason = False
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# if finished:
# break
# complete_response += chunk
# if has_finish_reason is False:
# raise Exception("finish reason not set for last chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_vertexai_stream_bad_key()
def test_completion_replicate_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stream_bad_key():
try:
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
break
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_replicate_stream_bad_key()
def test_completion_bedrock_ai21_stream():
try:
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
complete_response += chunk
if finished:
break
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_ai21_stream()
def test_completion_bedrock_ai21_stream_bad_key():
try:
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_ai21_stream_bad_key()
def test_completion_sagemaker_stream():
try:
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=messages,
temperature=0.2,
max_tokens=80,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
test_completion_sagemaker_stream()
# test on openai completion call
def test_openai_text_completion_call():
@@ -314,7 +700,33 @@ def test_openai_text_completion_call():
def ai21_completion_call():
try:
response = completion(
model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn
model="j2-ultra", messages=messages, stream=True
)
print(f"response: {response}")
has_finished = False
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finished = finished
complete_response += chunk
if finished:
break
if has_finished is False:
raise Exception("finished reason missing from final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except:
pytest.fail(f"error occurred: {traceback.format_exc()}")
# ai21_completion_call()
def ai21_completion_call_bad_key():
try:
api_key = "bad-key"
response = completion(
model="j2-ultra", messages=messages, stream=True, api_key=api_key
)
print(f"response: {response}")
complete_response = ""
@@ -327,10 +739,64 @@ def ai21_completion_call():
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except:
pytest.fail(f"error occurred: {traceback.format_exc()}")
# ai21_completion_call()
# ai21_completion_call_bad_key()
def test_completion_aleph_alpha():
try:
response = completion(
model="luminous-base", messages=messages, stream=True
)
# Add any assertions here to check the response
has_finished = False
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finished = finished
complete_response += chunk
if finished:
break
if has_finished is False:
raise Exception("finished reason missing from final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_aleph_alpha()
# def test_completion_aleph_alpha_bad_key():
# try:
# api_key = "bad-key"
# response = completion(
# model="luminous-base", messages=messages, stream=True, api_key=api_key
# )
# # Add any assertions here to check the response
# has_finished = False
# complete_response = ""
# start_time = time.time()
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finished = finished
# complete_response += chunk
# if finished:
# break
# if has_finished is False:
# raise Exception("finished reason missing from final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_aleph_alpha_bad_key()
# test on openai completion call
def test_openai_chat_completion_call():
try:
@@ -366,11 +832,15 @@ def test_together_ai_completion_call_starcoder():
)
complete_response = ""
print(f"returned response object: {response}")
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not set for last chunk")
if complete_response == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
@@ -378,6 +848,38 @@ def test_together_ai_completion_call_starcoder():
print(f"error occurred: {traceback.format_exc()}")
pass
# test_together_ai_completion_call_starcoder()
def test_together_ai_completion_call_starcoder_bad_key():
try:
api_key = "bad-key"
start_time = time.time()
response = completion(
model="together_ai/bigcode/starcoder",
messages=messages,
stream=True,
api_key=api_key
)
complete_response = ""
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not set for last chunk")
if complete_response == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
except InvalidRequestError as e:
pass
except:
print(f"error occurred: {traceback.format_exc()}")
pass
# test_together_ai_completion_call_starcoder_bad_key()
#### Test Function calling + streaming ####
def test_completion_openai_with_functions():

View file

@@ -2,6 +2,7 @@ import sys
import dotenv, json, traceback, threading
import subprocess, os
import litellm, openai
import itertools
import random, uuid, requests
import datetime, time
import tiktoken
@@ -1915,7 +1916,6 @@ def exception_type(
):
global user_logger_fn, liteDebuggerClient
exception_mapping_worked = False
if litellm.set_verbose == True:
litellm.error_logs['EXCEPTION'] = original_exception
litellm.error_logs['KWARGS'] = completion_kwargs
@@ -1970,7 +1970,7 @@ def exception_type(
exception_type = type(original_exception).__name__
else:
exception_type = ""
if "claude" in model: # one of the anthropics
if custom_llm_provider == "anthropic": # one of the anthropics
if hasattr(original_exception, "message"):
if "prompt is too long" in original_exception.message:
exception_mapping_worked = True
@@ -1979,6 +1979,13 @@ def exception_type(
model=model,
llm_provider="anthropic"
)
if "Invalid API Key" in original_exception.message:
exception_mapping_worked = True
raise AuthenticationError(
message=original_exception.message,
model=model,
llm_provider="anthropic"
)
if hasattr(original_exception, "status_code"):
print_verbose(f"status_code: {original_exception.status_code}")
if original_exception.status_code == 401:
@@ -2031,7 +2038,7 @@ def exception_type(
llm_provider="anthropic",
model=model
)
elif "replicate" in model:
elif custom_llm_provider == "replicate":
if "Incorrect authentication token" in error_str:
exception_mapping_worked = True
raise AuthenticationError(
@@ -2068,7 +2075,7 @@ def exception_type(
llm_provider="replicate",
model=model
)
elif original_exception.status_code == 400:
elif original_exception.status_code == 400 or original_exception.status_code == 422:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"ReplicateException - {original_exception.message}",
@@ -2110,7 +2117,31 @@ def exception_type(
llm_provider="replicate",
model=model
)
elif model in litellm.cohere_models or custom_llm_provider == "cohere": # Cohere
elif custom_llm_provider == "bedrock":
if "Unable to locate credentials" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"BedrockException - {error_str}",
model=model,
llm_provider="bedrock"
)
elif custom_llm_provider == "sagemaker":
if "Unable to locate credentials" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"SagemakerException - {error_str}",
model=model,
llm_provider="sagemaker"
)
elif custom_llm_provider == "vertex_ai":
if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"VertexAIException - {error_str}",
model=model,
llm_provider="vertex_ai"
)
elif custom_llm_provider == "cohere": # Cohere
if (
"invalid api token" in error_str
or "No API key provided." in error_str
@@ -2184,6 +2215,13 @@ def exception_type(
model=model,
llm_provider="huggingface"
)
elif "A valid user token is required" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=error_str,
llm_provider="huggingface",
model=model
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
exception_mapping_worked = True
@@ -2221,6 +2259,8 @@ def exception_type(
llm_provider="huggingface",
model=model
)
exception_mapping_worked = True
raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
elif custom_llm_provider == "ai21":
if hasattr(original_exception, "message"):
if "Prompt has too many tokens" in original_exception.message:
@@ -2230,6 +2270,13 @@ def exception_type(
model=model,
llm_provider="ai21"
)
if "Bad or missing API token." in original_exception.message:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"AI21Exception - {original_exception.message}",
model=model,
llm_provider="ai21"
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
exception_mapping_worked = True
@@ -2266,7 +2313,7 @@ def exception_type(
llm_provider="ai21",
model=model
)
elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
elif custom_llm_provider == "nlp_cloud":
if "detail" in error_str:
if "Input text length should not exceed" in error_str:
exception_mapping_worked = True
@@ -2342,6 +2389,7 @@ def exception_type(
model=model
)
elif custom_llm_provider == "together_ai":
import json
error_response = json.loads(error_str)
if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
exception_mapping_worked = True
@@ -2364,6 +2412,13 @@ def exception_type(
model=model,
llm_provider="together_ai"
)
elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai"
)
elif "error_type" in error_response and error_response["error_type"] == "validation":
exception_mapping_worked = True
raise InvalidRequestError(
@@ -2393,7 +2448,7 @@ def exception_type(
llm_provider="together_ai",
model=model
)
elif model in litellm.aleph_alpha_models:
elif custom_llm_provider == "aleph_alpha":
if "This is longer than the model's maximum context length" in error_str:
exception_mapping_worked = True
raise ContextWindowExceededError(
@@ -2401,6 +2456,13 @@ def exception_type(
llm_provider="aleph_alpha",
model=model
)
elif "InvalidToken" in error_str or "No token provided" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model
)
elif hasattr(original_exception, "status_code"):
print(f"status code: {original_exception.status_code}")
if original_exception.status_code == 401:
@@ -2445,7 +2507,8 @@ def exception_type(
elif custom_llm_provider == "ollama":
if "no attribute 'async_get_ollama_response_stream" in error_str:
raise ImportError("Import error - trying to use async for ollama. import async_generator failed. Try 'pip install async_generator'")
raise original_exception
exception_mapping_worked = True
raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
except Exception as e:
# LOGGING
exception_logging(
@@ -2563,6 +2626,7 @@ class CustomStreamWrapper:
self.logging_obj = logging_obj
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
if self.logging_obj:
# Log the type of the received item
self.logging_obj.post_call(str(type(completion_stream)))
@@ -2579,41 +2643,71 @@ class CustomStreamWrapper:
def handle_anthropic_chunk(self, chunk):
str_line = chunk.decode("utf-8") # Convert bytes to string
print(f"str_line: {str_line}")
text = ""
is_finished = False
finish_reason = None
if str_line.startswith("data:"):
data_json = json.loads(str_line[5:])
return data_json.get("completion", "")
return ""
text = data_json.get("completion", "")
if data_json.get("stop_reason", None):
is_finished = True
finish_reason = data_json["stop_reason"]
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in str_line:
raise ValueError(f"Unable to parse response. Original response: {str_line}")
else:
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_together_ai_chunk(self, chunk):
chunk = chunk.decode("utf-8")
text_index = chunk.find('"text":"') # this checks if text: exists
text_start = text_index + len('"text":"')
text_end = chunk.find('"}', text_start)
if text_index != -1 and text_end != -1:
extracted_text = chunk[text_start:text_end]
return extracted_text
text = ""
is_finished = False
finish_reason = None
if "text" in chunk:
text_index = chunk.find('"text":"') # this checks if text: exists
text_start = text_index + len('"text":"')
text_end = chunk.find('"}', text_start)
if text_index != -1 and text_end != -1:
extracted_text = chunk[text_start:text_end]
text = extracted_text
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "[DONE]" in chunk:
return {"text": text, "is_finished": True, "finish_reason": "stop"}
elif "error" in chunk:
raise ValueError(chunk)
else:
return ""
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_huggingface_chunk(self, chunk):
chunk = chunk.decode("utf-8")
text = ""
is_finished = False
finish_reason = ""
if chunk.startswith("data:"):
data_json = json.loads(chunk[5:])
print(f"data json: {data_json}")
if "token" in data_json and "text" in data_json["token"]:
text = data_json["token"]["text"]
if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text
if any(token in text for token in llama_2_special_tokens):
text = text.replace("<s>", "").replace("</s>", "")
return text
else:
return ""
return ""
if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
is_finished = True
finish_reason = data_json["details"]["finish_reason"]
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in chunk:
raise ValueError(chunk)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_ai21_chunk(self, chunk):
def handle_ai21_chunk(self, chunk): # fake streaming
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["completions"][0]["data"]["text"]
text = data_json["completions"][0]["data"]["text"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2621,8 +2715,10 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
print(f"data json: {data_json}")
return data_json["generated_text"]
text = data_json["generated_text"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2630,7 +2726,10 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["completions"][0]["completion"]
text = data_json["completions"][0]["completion"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2638,7 +2737,35 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["text"]
text = ""
is_finished = False
finish_reason = ""
if "text" in data_json:
text = data_json["text"]
elif "is_finished" in data_json:
is_finished = data_json["is_finished"]
finish_reason = data_json["finish_reason"]
else:
raise Exception(data_json)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
def handle_replicate_chunk(self, chunk):
print(f"chunk: {chunk}")
try:
text = ""
is_finished = False
finish_reason = ""
if "output" in chunk:
text = chunk['output']
if "status" in chunk:
if chunk["status"] == "succeeded":
is_finished = True
finish_reason = "stop"
elif chunk.get("error", None):
raise Exception(chunk["error"])
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2683,13 +2810,21 @@ class CustomStreamWrapper:
traceback.print_exc()
return ""
def handle_bedrock_stream(self):
if self.completion_stream:
event = next(self.completion_stream)
chunk = event.get('chunk')
if chunk:
chunk_data = json.loads(chunk.get('bytes').decode())
return chunk_data['outputText']
def handle_bedrock_stream(self, chunk):
chunk = chunk.get('chunk')
if chunk:
chunk_data = json.loads(chunk.get('bytes').decode())
text = ""
is_finished = False
finish_reason = ""
if "outputText" in chunk_data:
text = chunk_data['outputText']
if chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
elif chunk.get("error", None):
raise Exception(chunk["error"])
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
return ""
## needs to handle the empty string case (even starting chunk can be an empty string)
@@ -2701,49 +2836,94 @@ class CustomStreamWrapper:
completion_obj = {"content": ""}
if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_anthropic_chunk(chunk)
response_obj = self.handle_anthropic_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.model == "replicate" or self.custom_llm_provider == "replicate":
chunk = next(self.completion_stream)
completion_obj["content"] = chunk
response_obj = self.handle_replicate_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif (
self.custom_llm_provider and self.custom_llm_provider == "together_ai"):
chunk = next(self.completion_stream)
text_data = self.handle_together_ai_chunk(chunk)
if text_data == "":
return self.__next__()
completion_obj["content"] = text_data
response_obj = self.handle_together_ai_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_huggingface_chunk(chunk)
response_obj = self.handle_huggingface_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_baseten_chunk(chunk)
elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_ai21_chunk(chunk)
response_obj = self.handle_ai21_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
chunk = next(self.completion_stream)
completion_obj["content"] = chunk[0].outputs[0].text
elif self.custom_llm_provider and self.custom_llm_provider == "aleph-alpha": #aleph alpha doesn't provide streaming
elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk)
response_obj = self.handle_aleph_alpha_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
elif self.model in (litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models):
chunk = next(self.completion_stream)
completion_obj["content"] = str(chunk)
try:
chunk = next(self.completion_stream)
response_obj = self.handle_nlp_cloud_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
except Exception as e:
if self.sent_last_chunk:
raise e
else:
if self.sent_first_chunk is False:
raise Exception("An unknown error occurred with the stream")
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai":
try:
chunk = next(self.completion_stream)
completion_obj["content"] = str(chunk)
except StopIteration as e:
if self.sent_last_chunk:
raise e
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
elif self.custom_llm_provider == "cohere":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_cohere_chunk(chunk)
response_obj = self.handle_cohere_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "bedrock":
completion_obj["content"] = self.handle_bedrock_stream()
chunk = next(self.completion_stream)
response_obj = self.handle_bedrock_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "sagemaker":
if len(self.completion_stream)==0:
raise StopIteration
if self.sent_last_chunk:
raise StopIteration
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
chunk_size = 30
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
@@ -2765,11 +2945,13 @@ class CustomStreamWrapper:
self.sent_first_chunk = True
model_response.choices[0].delta = Delta(**completion_obj)
return model_response
elif model_response.choices[0].finish_reason:
return model_response
except StopIteration:
raise StopIteration
except Exception as e:
model_response.choices[0].finish_reason = "stop"
return model_response
except Exception as e:
e.message = str(e)
return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e)
async def __anext__(self):
try:
@@ -2796,7 +2978,6 @@ def read_config_args(config_path) -> dict:
# read keys/ values from config file and return them
return config
except Exception as e:
print("An error occurred while reading config:", str(e))
raise e
########## experimental completion variants ############################
@@ -2899,7 +3080,6 @@ def get_model_split_test(models, completion_call_id):
try:
# make the api call
last_fetched_at = time.time()
print(f"last_fetched_at: {last_fetched_at}")
response = requests.post(
#http://api.litellm.ai
url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict

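With the provider branches above keyed on custom_llm_provider, a bad key on a streamed call now surfaces as a typed litellm exception rather than an unmapped provider error. A minimal sketch mirroring the bad-key tests earlier in this commit (the model and key are placeholders; which exception type fires depends on the provider):

from litellm import completion, AuthenticationError, InvalidRequestError

try:
    response = completion(
        model="command-nightly",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        api_key="bad-key",                 # deliberately invalid
    )
    for chunk in response:
        pass                               # the mapped error is raised while iterating
except AuthenticationError:
    print("cohere maps an invalid key to AuthenticationError")
except InvalidRequestError:
    print("some providers (replicate, ai21, together_ai) map bad keys to InvalidRequestError")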
View file

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.1.738"
version = "0.1.739"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"