Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)

commit 889679a0dd (parent f984e5f380)
fix exception mapping for streaming

8 changed files with 766 additions and 100 deletions

Binary file not shown.
Binary file not shown.
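For orientation before the per-file hunks: this commit makes every provider-specific chunk handler in CustomStreamWrapper return a small dict (text, is_finished, finish_reason) instead of a bare string, so the wrapper can set finish_reason on the final streamed chunk, and it switches exception mapping from model-name checks to custom_llm_provider checks. The sketch below illustrates that handler contract; handle_anthropic_chunk mirrors the code added in litellm/utils.py further down, while stream_deltas is only an illustrative consumer and is not part of the commit.

import json

def handle_anthropic_chunk(chunk: bytes) -> dict:
    # Mirrors the handler in the utils.py hunk below: parse one SSE line and
    # report both the text delta and whether the provider signalled completion.
    str_line = chunk.decode("utf-8")
    text, is_finished, finish_reason = "", False, None
    if str_line.startswith("data:"):
        data_json = json.loads(str_line[5:])
        text = data_json.get("completion", "")
        if data_json.get("stop_reason", None):
            is_finished = True
            finish_reason = data_json["stop_reason"]
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

def stream_deltas(raw_chunks):
    # Illustrative consumer (assumed, not in the diff): the wrapper copies
    # finish_reason onto the last OpenAI-style delta it yields.
    for raw in raw_chunks:
        parsed = handle_anthropic_chunk(raw)
        delta = {"choices": [{"delta": {"content": parsed["text"]}, "finish_reason": None}]}
        if parsed["is_finished"]:
            delta["choices"][0]["finish_reason"] = parsed["finish_reason"]
        yield delta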
@@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbose):
    }
    status = ""
    while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.0001)
+        time.sleep(0.0001)  # prevent being rate limited by replicate
        response = requests.get(prediction_url, headers=headers)
        if response.status_code == 200:
            response_data = response.json()
+            status = response_data['status']
+            print(f"response data: {response_data}")
            if "output" in response_data:
                output_string = "".join(response_data['output'])
                new_output = output_string[len(previous_output):]
-                yield new_output
+                yield {"output": new_output, "status": status}
                previous_output = output_string
            status = response_data['status']

@@ -485,11 +485,11 @@ def completion(
        # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
        replicate_key = None
        replicate_key = (
-            get_secret("REPLICATE_API_KEY")
-            or get_secret("REPLICATE_API_TOKEN")
-            or api_key
+            api_key
            or litellm.replicate_key
            or litellm.api_key
+            or get_secret("REPLICATE_API_KEY")
+            or get_secret("REPLICATE_API_TOKEN")
        )

        model_response = replicate.completion(
@@ -575,7 +575,7 @@ def completion(

        if "stream" in optional_params and optional_params["stream"] == True:
            # don't try to access stream object,
-            response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging)
+            response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging)
            return response
        response = model_response
    elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
@@ -769,7 +769,7 @@ def completion(
        if stream:
            model_response = chat.send_message_streaming(prompt, **optional_params)
            response = CustomStreamWrapper(
-                model_response, model, custom_llm_provider="vertexai", logging_obj=logging
+                model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
            )
            return response

@@ -643,24 +643,6 @@ def test_completion_sagemaker():

# test_completion_sagemaker()

-def test_completion_sagemaker_stream():
-    litellm.set_verbose = False
-    try:
-        response = completion(
-            model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
-            messages=messages,
-            temperature=0.2,
-            max_tokens=80,
-            stream=True,
-        )
-        # Add any assertions here to check the response
-        for chunk in response:
-            print(chunk)
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-# test_completion_sagemaker_stream()
-
def test_completion_bedrock_titan():
    try:
        response = completion(
@@ -9,7 +9,7 @@ sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
-from litellm import completion, acompletion
+from litellm import completion, acompletion, AuthenticationError, InvalidRequestError

litellm.logging = False
litellm.set_verbose = False
@@ -187,6 +187,7 @@ def streaming_format_tests(idx, chunk):
        finished = True
    if "content" in chunk["choices"][0]["delta"]:
        extracted_chunk = chunk["choices"][0]["delta"]["content"]
+    print(f"extracted chunk: {extracted_chunk}")
    return extracted_chunk, finished

def test_completion_cohere_stream():
@@ -199,21 +200,120 @@ def test_completion_cohere_stream():
            },
        ]
        response = completion(
-            model="command-nightly", messages=messages, stream=True, max_tokens=50
+            model="command-nightly", messages=messages, stream=True, max_tokens=50,
        )
        complete_response = ""
        # Add any assertions here to check the response
+        has_finish_reason = False
        for idx, chunk in enumerate(response):
            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
            if finished:
                break
            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("Finish reason not in final chunk")
        if complete_response.strip() == "":
            raise Exception("Empty response received")
        print(f"completion_response: {complete_response}")
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

+# test_completion_cohere_stream()
+
+def test_completion_cohere_stream_bad_key():
+    try:
+        api_key = "bad-key"
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="command-nightly", messages=messages, stream=True, max_tokens=50, api_key=api_key
+        )
+        complete_response = ""
+        # Add any assertions here to check the response
+        has_finish_reason = False
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("Finish reason not in final chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except AuthenticationError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+# test_completion_cohere_stream_bad_key()
+
+# def test_completion_nlp_cloud():
+#     try:
+#         messages = [
+#             {"role": "system", "content": "You are a helpful assistant."},
+#             {
+#                 "role": "user",
+#                 "content": "how does a court case get to the Supreme Court?",
+#             },
+#         ]
+#         response = completion(model="dolphin", messages=messages, stream=True)
+#         complete_response = ""
+#         # Add any assertions here to check the response
+#         has_finish_reason = False
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             has_finish_reason = finished
+#             complete_response += chunk
+#             if finished:
+#                 break
+#         if has_finish_reason is False:
+#             raise Exception("Finish reason not in final chunk")
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#         print(f"completion_response: {complete_response}")
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_nlp_cloud()
+
+# def test_completion_nlp_cloud_bad_key():
+#     try:
+#         api_key = "bad-key"
+#         messages = [
+#             {"role": "system", "content": "You are a helpful assistant."},
+#             {
+#                 "role": "user",
+#                 "content": "how does a court case get to the Supreme Court?",
+#             },
+#         ]
+#         response = completion(model="dolphin", messages=messages, stream=True, api_key=api_key)
+#         complete_response = ""
+#         # Add any assertions here to check the response
+#         has_finish_reason = False
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             has_finish_reason = finished
+#             complete_response += chunk
+#             if finished:
+#                 break
+#         if has_finish_reason is False:
+#             raise Exception("Finish reason not in final chunk")
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#         print(f"completion_response: {complete_response}")
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_nlp_cloud_bad_key()
+
# def test_completion_hf_stream():
#     try:
#         messages = [
@@ -235,10 +335,41 @@ def test_completion_cohere_stream():
#         if complete_response.strip() == "":
#             raise Exception("Empty response received")
#         print(f"completion_response: {complete_response}")
+#     except InvalidRequestError as e:
+#         pass
#     except Exception as e:
#         pytest.fail(f"Error occurred: {e}")

-# test_completion_hf_stream()
+# # test_completion_hf_stream()

+# def test_completion_hf_stream_bad_key():
+#     try:
+#         api_key = "bad-key"
+#         messages = [
+#             {
+#                 "content": "Hello! How are you today?",
+#                 "role": "user"
+#             },
+#         ]
+#         response = completion(
+#             model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base="https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud", stream=True, max_tokens=1000, api_key=api_key
+#         )
+#         complete_response = ""
+#         # Add any assertions here to check the response
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             if finished:
+#                 break
+#             complete_response += chunk
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#         print(f"completion_response: {complete_response}")
+#     except InvalidRequestError as e:
+#         pass
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_hf_stream_bad_key()
+
def test_completion_claude_stream():
    try:
@@ -266,9 +397,202 @@ def test_completion_claude_stream():
        pytest.fail(f"Error occurred: {e}")
# test_completion_claude_stream()

+
+def test_completion_claude_stream_bad_key():
+    try:
+        api_key = "bad-key"
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="claude-instant-1", messages=messages, stream=True, max_tokens=50, api_key=api_key
+        )
+        complete_response = ""
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_completion_claude_stream_bad_key()
+
+def test_completion_replicate_stream():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+# test_completion_replicate_stream()
+
+# def test_completion_vertexai_stream():
+#     try:
+#         import os
+#         os.environ["VERTEXAI_PROJECT"] = "pathrise-convert-1606954137718"
+#         os.environ["VERTEXAI_LOCATION"] = "us-central1"
+#         messages = [
+#             {"role": "system", "content": "You are a helpful assistant."},
+#             {
+#                 "role": "user",
+#                 "content": "how does a court case get to the Supreme Court?",
+#             },
+#         ]
+#         response = completion(
+#             model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
+#         )
+#         complete_response = ""
+#         has_finish_reason = False
+#         # Add any assertions here to check the response
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             has_finish_reason = finished
+#             if finished:
+#                 break
+#             complete_response += chunk
+#         if has_finish_reason is False:
+#             raise Exception("finish reason not set for last chunk")
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#         print(f"completion_response: {complete_response}")
+#     except InvalidRequestError as e:
+#         pass
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_vertexai_stream()
+
+
+# def test_completion_vertexai_stream_bad_key():
+#     try:
+#         import os
+#         messages = [
+#             {"role": "system", "content": "You are a helpful assistant."},
+#             {
+#                 "role": "user",
+#                 "content": "how does a court case get to the Supreme Court?",
+#             },
+#         ]
+#         response = completion(
+#             model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
+#         )
+#         complete_response = ""
+#         has_finish_reason = False
+#         # Add any assertions here to check the response
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             has_finish_reason = finished
+#             if finished:
+#                 break
+#             complete_response += chunk
+#         if has_finish_reason is False:
+#             raise Exception("finish reason not set for last chunk")
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#         print(f"completion_response: {complete_response}")
+#     except InvalidRequestError as e:
+#         pass
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_vertexai_stream_bad_key()
+
+def test_completion_replicate_stream():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+def test_completion_replicate_stream_bad_key():
+    try:
+        api_key = "bad-key"
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50, api_key=api_key
+        )
+        complete_response = ""
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+# test_completion_replicate_stream_bad_key()
+
def test_completion_bedrock_ai21_stream():
    try:
-        litellm.set_verbose = False
        response = completion(
            model="bedrock/amazon.titan-tg1-large",
            messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
@@ -277,20 +601,82 @@ def test_completion_bedrock_ai21_stream():
            stream=True,
        )
        complete_response = ""
+        has_finish_reason = False
        # Add any assertions here to check the response
-        print(response)
        for idx, chunk in enumerate(response):
            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            complete_response += chunk
            if finished:
                break
-            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
        if complete_response.strip() == "":
            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

+# test_completion_bedrock_ai21_stream()

-# test_completion_cohere_stream()
+def test_completion_bedrock_ai21_stream_bad_key():
+    try:
+        response = completion(
+            model="bedrock/amazon.titan-tg1-large",
+            messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
+            temperature=1,
+            max_tokens=4096,
+            stream=True,
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+# test_completion_bedrock_ai21_stream_bad_key()
+
+def test_completion_sagemaker_stream():
+    try:
+        response = completion(
+            model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
+            messages=messages,
+            temperature=0.2,
+            max_tokens=80,
+            stream=True,
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+    except InvalidRequestError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+test_completion_sagemaker_stream()
+
# test on openai completion call
def test_openai_text_completion_call():
@@ -314,7 +700,33 @@ def test_openai_text_completion_call():
def ai21_completion_call():
    try:
        response = completion(
-            model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn
+            model="j2-ultra", messages=messages, stream=True
+        )
+        print(f"response: {response}")
+        has_finished = False
+        complete_response = ""
+        start_time = time.time()
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finished = finished
+            complete_response += chunk
+            if finished:
+                break
+        if has_finished is False:
+            raise Exception("finished reason missing from final chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except:
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+
+# ai21_completion_call()
+
+def ai21_completion_call_bad_key():
+    try:
+        api_key = "bad-key"
+        response = completion(
+            model="j2-ultra", messages=messages, stream=True, api_key=api_key
        )
        print(f"response: {response}")
        complete_response = ""
@@ -327,10 +739,64 @@ def ai21_completion_call():
        if complete_response.strip() == "":
            raise Exception("Empty response received")
        print(f"completion_response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
    except:
        pytest.fail(f"error occurred: {traceback.format_exc()}")

-# ai21_completion_call()
+# ai21_completion_call_bad_key()

+def test_completion_aleph_alpha():
+    try:
+        response = completion(
+            model="luminous-base", messages=messages, stream=True
+        )
+        # Add any assertions here to check the response
+        has_finished = False
+        complete_response = ""
+        start_time = time.time()
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finished = finished
+            complete_response += chunk
+            if finished:
+                break
+        if has_finished is False:
+            raise Exception("finished reason missing from final chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+# test_completion_aleph_alpha()
+
+# def test_completion_aleph_alpha_bad_key():
+#     try:
+#         api_key = "bad-key"
+#         response = completion(
+#             model="luminous-base", messages=messages, stream=True, api_key=api_key
+#         )
+#         # Add any assertions here to check the response
+#         has_finished = False
+#         complete_response = ""
+#         start_time = time.time()
+#         for idx, chunk in enumerate(response):
+#             chunk, finished = streaming_format_tests(idx, chunk)
+#             has_finished = finished
+#             complete_response += chunk
+#             if finished:
+#                 break
+#         if has_finished is False:
+#             raise Exception("finished reason missing from final chunk")
+#         if complete_response.strip() == "":
+#             raise Exception("Empty response received")
+#     except InvalidRequestError as e:
+#         pass
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_aleph_alpha_bad_key()
+
# test on openai completion call
def test_openai_chat_completion_call():
    try:
@@ -366,11 +832,15 @@ def test_together_ai_completion_call_starcoder():
        )
        complete_response = ""
        print(f"returned response object: {response}")
+        has_finish_reason = False
        for idx, chunk in enumerate(response):
            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
            if finished:
                break
            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("Finish reason not set for last chunk")
        if complete_response == "":
            raise Exception("Empty response received")
        print(f"complete response: {complete_response}")
@@ -378,6 +848,38 @@ def test_together_ai_completion_call_starcoder():
        print(f"error occurred: {traceback.format_exc()}")
        pass

+# test_together_ai_completion_call_starcoder()
+
+def test_together_ai_completion_call_starcoder_bad_key():
+    try:
+        api_key = "bad-key"
+        start_time = time.time()
+        response = completion(
+            model="together_ai/bigcode/starcoder",
+            messages=messages,
+            stream=True,
+            api_key=api_key
+        )
+        complete_response = ""
+        has_finish_reason = False
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("Finish reason not set for last chunk")
+        if complete_response == "":
+            raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
+    except InvalidRequestError as e:
+        pass
+    except:
+        print(f"error occurred: {traceback.format_exc()}")
+        pass
+
+# test_together_ai_completion_call_starcoder_bad_key()
+
#### Test Function calling + streaming ####

def test_completion_openai_with_functions():
270 litellm/utils.py
@@ -2,6 +2,7 @@ import sys
import dotenv, json, traceback, threading
import subprocess, os
import litellm, openai
+import itertools
import random, uuid, requests
import datetime, time
import tiktoken
@@ -1915,7 +1916,6 @@ def exception_type(
):
    global user_logger_fn, liteDebuggerClient
    exception_mapping_worked = False
-
    if litellm.set_verbose == True:
        litellm.error_logs['EXCEPTION'] = original_exception
        litellm.error_logs['KWARGS'] = completion_kwargs
@@ -1970,7 +1970,7 @@ def exception_type(
            exception_type = type(original_exception).__name__
        else:
            exception_type = ""
-        if "claude" in model:  # one of the anthropics
+        if custom_llm_provider == "anthropic":  # one of the anthropics
            if hasattr(original_exception, "message"):
                if "prompt is too long" in original_exception.message:
                    exception_mapping_worked = True
@@ -1979,6 +1979,13 @@ def exception_type(
                        model=model,
                        llm_provider="anthropic"
                    )
+                if "Invalid API Key" in original_exception.message:
+                    exception_mapping_worked = True
+                    raise AuthenticationError(
+                        message=original_exception.message,
+                        model=model,
+                        llm_provider="anthropic"
+                    )
            if hasattr(original_exception, "status_code"):
                print_verbose(f"status_code: {original_exception.status_code}")
                if original_exception.status_code == 401:
@@ -2031,7 +2038,7 @@ def exception_type(
                        llm_provider="anthropic",
                        model=model
                    )
-        elif "replicate" in model:
+        elif custom_llm_provider == "replicate":
            if "Incorrect authentication token" in error_str:
                exception_mapping_worked = True
                raise AuthenticationError(
@@ -2068,7 +2075,7 @@ def exception_type(
                        llm_provider="replicate",
                        model=model
                    )
-                elif original_exception.status_code == 400:
+                elif original_exception.status_code == 400 or original_exception.status_code == 422:
                    exception_mapping_worked = True
                    raise InvalidRequestError(
                        message=f"ReplicateException - {original_exception.message}",
@@ -2110,7 +2117,31 @@ def exception_type(
                        llm_provider="replicate",
                        model=model
                    )
-        elif model in litellm.cohere_models or custom_llm_provider == "cohere":  # Cohere
+        elif custom_llm_provider == "bedrock":
+            if "Unable to locate credentials" in error_str:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=f"BedrockException - {error_str}",
+                    model=model,
+                    llm_provider="bedrock"
+                )
+        elif custom_llm_provider == "sagemaker":
+            if "Unable to locate credentials" in error_str:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=f"SagemakerException - {error_str}",
+                    model=model,
+                    llm_provider="sagemaker"
+                )
+        elif custom_llm_provider == "vertex_ai":
+            if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=f"VertexAIException - {error_str}",
+                    model=model,
+                    llm_provider="vertex_ai"
+                )
+        elif custom_llm_provider == "cohere":  # Cohere
            if (
                "invalid api token" in error_str
                or "No API key provided." in error_str
@@ -2184,6 +2215,13 @@ def exception_type(
                    model=model,
                    llm_provider="huggingface"
                )
+            elif "A valid user token is required" in error_str:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=error_str,
+                    llm_provider="huggingface",
+                    model=model
+                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
                    exception_mapping_worked = True
@@ -2221,6 +2259,8 @@ def exception_type(
                    llm_provider="huggingface",
                    model=model
                )
+            exception_mapping_worked = True
+            raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
        elif custom_llm_provider == "ai21":
            if hasattr(original_exception, "message"):
                if "Prompt has too many tokens" in original_exception.message:
@@ -2230,6 +2270,13 @@ def exception_type(
                        model=model,
                        llm_provider="ai21"
                    )
+                if "Bad or missing API token." in original_exception.message:
+                    exception_mapping_worked = True
+                    raise InvalidRequestError(
+                        message=f"AI21Exception - {original_exception.message}",
+                        model=model,
+                        llm_provider="ai21"
+                    )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
                    exception_mapping_worked = True
@@ -2266,7 +2313,7 @@ def exception_type(
                        llm_provider="ai21",
                        model=model
                    )
-        elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+        elif custom_llm_provider == "nlp_cloud":
            if "detail" in error_str:
                if "Input text length should not exceed" in error_str:
                    exception_mapping_worked = True
@@ -2342,6 +2389,7 @@ def exception_type(
                    model=model
                )
        elif custom_llm_provider == "together_ai":
+            import json
            error_response = json.loads(error_str)
            if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
                exception_mapping_worked = True
@@ -2364,6 +2412,13 @@ def exception_type(
                    model=model,
                    llm_provider="together_ai"
                )
+            elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=f"TogetherAIException - {error_response['error']}",
+                    model=model,
+                    llm_provider="together_ai"
+                )
            elif "error_type" in error_response and error_response["error_type"] == "validation":
                exception_mapping_worked = True
                raise InvalidRequestError(
@@ -2393,7 +2448,7 @@ def exception_type(
                    llm_provider="together_ai",
                    model=model
                )
-        elif model in litellm.aleph_alpha_models:
+        elif custom_llm_provider == "aleph_alpha":
            if "This is longer than the model's maximum context length" in error_str:
                exception_mapping_worked = True
                raise ContextWindowExceededError(
@@ -2401,6 +2456,13 @@ def exception_type(
                    llm_provider="aleph_alpha",
                    model=model
                )
+            elif "InvalidToken" in error_str or "No token provided" in error_str:
+                exception_mapping_worked = True
+                raise InvalidRequestError(
+                    message=f"AlephAlphaException - {original_exception.message}",
+                    llm_provider="aleph_alpha",
+                    model=model
+                )
            elif hasattr(original_exception, "status_code"):
                print(f"status code: {original_exception.status_code}")
                if original_exception.status_code == 401:
@@ -2445,7 +2507,8 @@ def exception_type(
        elif custom_llm_provider == "ollama":
            if "no attribute 'async_get_ollama_response_stream" in error_str:
                raise ImportError("Import error - trying to use async for ollama. import async_generator failed. Try 'pip install async_generator'")
-        raise original_exception
+        exception_mapping_worked = True
+        raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
    except Exception as e:
        # LOGGING
        exception_logging(
@@ -2563,6 +2626,7 @@ class CustomStreamWrapper:
        self.logging_obj = logging_obj
        self.completion_stream = completion_stream
        self.sent_first_chunk = False
+        self.sent_last_chunk = False
        if self.logging_obj:
            # Log the type of the received item
            self.logging_obj.post_call(str(type(completion_stream)))
@@ -2579,41 +2643,71 @@ class CustomStreamWrapper:

    def handle_anthropic_chunk(self, chunk):
        str_line = chunk.decode("utf-8")  # Convert bytes to string
+        print(f"str_line: {str_line}")
+        text = ""
+        is_finished = False
+        finish_reason = None
        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
-            return data_json.get("completion", "")
-        return ""
+            text = data_json.get("completion", "")
+            if data_json.get("stop_reason", None):
+                is_finished = True
+                finish_reason = data_json["stop_reason"]
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        elif "error" in str_line:
+            raise ValueError(f"Unable to parse response. Original response: {str_line}")
+        else:
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

    def handle_together_ai_chunk(self, chunk):
        chunk = chunk.decode("utf-8")
+        text = ""
+        is_finished = False
+        finish_reason = None
+        if "text" in chunk:
            text_index = chunk.find('"text":"')  # this checks if text: exists
            text_start = text_index + len('"text":"')
            text_end = chunk.find('"}', text_start)
            if text_index != -1 and text_end != -1:
                extracted_text = chunk[text_start:text_end]
-                return extracted_text
+                text = extracted_text
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        elif "[DONE]" in chunk:
+            return {"text": text, "is_finished": True, "finish_reason": "stop"}
+        elif "error" in chunk:
+            raise ValueError(chunk)
        else:
-            return ""
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

    def handle_huggingface_chunk(self, chunk):
        chunk = chunk.decode("utf-8")
+        text = ""
+        is_finished = False
+        finish_reason = ""
        if chunk.startswith("data:"):
            data_json = json.loads(chunk[5:])
+            print(f"data json: {data_json}")
            if "token" in data_json and "text" in data_json["token"]:
                text = data_json["token"]["text"]
                if "meta-llama/Llama-2" in self.model:  # clean eos tokens like </s> from the returned output text
                    if any(token in text for token in llama_2_special_tokens):
                        text = text.replace("<s>", "").replace("</s>", "")
-            return text
-        else:
-            return ""
-        return ""
+            if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
+                is_finished = True
+                finish_reason = data_json["details"]["finish_reason"]
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        elif "error" in chunk:
+            raise ValueError(chunk)
+        return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

-    def handle_ai21_chunk(self, chunk):
+    def handle_ai21_chunk(self, chunk):  # fake streaming
        chunk = chunk.decode("utf-8")
        data_json = json.loads(chunk)
        try:
-            return data_json["completions"][0]["data"]["text"]
+            text = data_json["completions"][0]["data"]["text"]
+            is_finished = True
+            finish_reason = "stop"
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        except:
            raise ValueError(f"Unable to parse response. Original response: {chunk}")

@@ -2621,8 +2715,10 @@ class CustomStreamWrapper:
        chunk = chunk.decode("utf-8")
        data_json = json.loads(chunk)
        try:
-            print(f"data json: {data_json}")
-            return data_json["generated_text"]
+            text = data_json["generated_text"]
+            is_finished = True
+            finish_reason = "stop"
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        except:
            raise ValueError(f"Unable to parse response. Original response: {chunk}")

@@ -2630,7 +2726,10 @@ class CustomStreamWrapper:
        chunk = chunk.decode("utf-8")
        data_json = json.loads(chunk)
        try:
-            return data_json["completions"][0]["completion"]
+            text = data_json["completions"][0]["completion"]
+            is_finished = True
+            finish_reason = "stop"
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        except:
            raise ValueError(f"Unable to parse response. Original response: {chunk}")

@@ -2638,7 +2737,35 @@ class CustomStreamWrapper:
        chunk = chunk.decode("utf-8")
        data_json = json.loads(chunk)
        try:
-            return data_json["text"]
+            text = ""
+            is_finished = False
+            finish_reason = ""
+            if "text" in data_json:
+                text = data_json["text"]
+            elif "is_finished" in data_json:
+                is_finished = data_json["is_finished"]
+                finish_reason = data_json["finish_reason"]
+            else:
+                raise Exception(data_json)
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        except:
+            raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+    def handle_replicate_chunk(self, chunk):
+        print(f"chunk: {chunk}")
+        try:
+            text = ""
+            is_finished = False
+            finish_reason = ""
+            if "output" in chunk:
+                text = chunk['output']
+            if "status" in chunk:
+                if chunk["status"] == "succeeded":
+                    is_finished = True
+                    finish_reason = "stop"
+            elif chunk.get("error", None):
+                raise Exception(chunk["error"])
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        except:
            raise ValueError(f"Unable to parse response. Original response: {chunk}")

@@ -2683,13 +2810,21 @@ class CustomStreamWrapper:
            traceback.print_exc()
            return ""

-    def handle_bedrock_stream(self):
-        if self.completion_stream:
-            event = next(self.completion_stream)
-            chunk = event.get('chunk')
+    def handle_bedrock_stream(self, chunk):
+        chunk = chunk.get('chunk')
        if chunk:
            chunk_data = json.loads(chunk.get('bytes').decode())
-            return chunk_data['outputText']
+            text = ""
+            is_finished = False
+            finish_reason = ""
+            if "outputText" in chunk_data:
+                text = chunk_data['outputText']
+            if chunk_data.get("completionReason", None):
+                is_finished = True
+                finish_reason = chunk_data["completionReason"]
+            elif chunk.get("error", None):
+                raise Exception(chunk["error"])
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        return ""

    ## needs to handle the empty string case (even starting chunk can be an empty string)
@@ -2701,49 +2836,94 @@ class CustomStreamWrapper:
             completion_obj = {"content": ""}
             if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_anthropic_chunk(chunk)
+                response_obj = self.handle_anthropic_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.model == "replicate" or self.custom_llm_provider == "replicate":
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = chunk
+                response_obj = self.handle_replicate_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif (
                 self.custom_llm_provider and self.custom_llm_provider == "together_ai"):
                 chunk = next(self.completion_stream)
-                text_data = self.handle_together_ai_chunk(chunk)
-                if text_data == "":
-                    return self.__next__()
-                completion_obj["content"] = text_data
+                response_obj = self.handle_together_ai_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_huggingface_chunk(chunk)
+                response_obj = self.handle_huggingface_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_baseten_chunk(chunk)
             elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_ai21_chunk(chunk)
+                response_obj = self.handle_ai21_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = chunk[0].outputs[0].text
-            elif self.custom_llm_provider and self.custom_llm_provider == "aleph-alpha": #aleph alpha doesn't provide streaming
+            elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk)
+                response_obj = self.handle_aleph_alpha_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
             elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
-                chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
-            elif self.model in (litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models):
-                chunk = next(self.completion_stream)
-                completion_obj["content"] = str(chunk)
+                try:
+                    chunk = next(self.completion_stream)
+                    response_obj = self.handle_nlp_cloud_chunk(chunk)
+                    completion_obj["content"] = response_obj["text"]
+                    if response_obj["is_finished"]:
+                        model_response.choices[0].finish_reason = response_obj["finish_reason"]
+                except Exception as e:
+                    if self.sent_last_chunk:
+                        raise e
+                    else:
+                        if self.sent_first_chunk is False:
+                            raise Exception("An unknown error occurred with the stream")
+                        model_response.choices[0].finish_reason = "stop"
+                        self.sent_last_chunk = True
+            elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai":
+                try:
+                    chunk = next(self.completion_stream)
+                    completion_obj["content"] = str(chunk)
+                except StopIteration as e:
+                    if self.sent_last_chunk:
+                        raise e
+                    else:
+                        model_response.choices[0].finish_reason = "stop"
+                        self.sent_last_chunk = True
             elif self.custom_llm_provider == "cohere":
                 chunk = next(self.completion_stream)
-                completion_obj["content"] = self.handle_cohere_chunk(chunk)
+                response_obj = self.handle_cohere_chunk(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider == "bedrock":
-                completion_obj["content"] = self.handle_bedrock_stream()
+                chunk = next(self.completion_stream)
+                response_obj = self.handle_bedrock_stream(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    model_response.choices[0].finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider == "sagemaker":
                 if len(self.completion_stream)==0:
-                    raise StopIteration
+                    if self.sent_last_chunk:
+                        raise StopIteration
+                    else:
+                        model_response.choices[0].finish_reason = "stop"
+                        self.sent_last_chunk = True
                 chunk_size = 30
                 new_chunk = self.completion_stream[:chunk_size]
                 completion_obj["content"] = new_chunk
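All of the provider branches above now reduce to the same four steps. A stripped-down sketch of that shared pattern (standalone function with simplified names — not the actual CustomStreamWrapper code):

def consume_stream_chunk(handler, completion_stream, model_response):
    # 1. pull the next raw chunk, 2. parse it with the provider-specific handler,
    # 3. surface the text delta, 4. propagate finish_reason once the stream ends
    chunk = next(completion_stream)
    response_obj = handler(chunk)  # {"text": ..., "is_finished": ..., "finish_reason": ...}
    if response_obj["is_finished"]:
        model_response.choices[0].finish_reason = response_obj["finish_reason"]
    return response_obj["text"]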
@@ -2765,11 +2945,13 @@ class CustomStreamWrapper:
                     self.sent_first_chunk = True
                 model_response.choices[0].delta = Delta(**completion_obj)
                 return model_response
+            elif model_response.choices[0].finish_reason:
+                return model_response
         except StopIteration:
             raise StopIteration
         except Exception as e:
-            model_response.choices[0].finish_reason = "stop"
-            return model_response
+            e.message = str(e)
+            return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e)
 
     async def __anext__(self):
         try:
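Because the generic except branch now routes through exception_type() instead of silently finishing with "stop", callers iterating a stream should see provider failures as mapped, OpenAI-style exceptions. A hedged usage sketch (the model name is illustrative; the exact exception classes raised depend on the provider and litellm version):

import litellm

try:
    response = litellm.completion(
        model="replicate/llama-2-70b-chat",  # illustrative model name
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
    )
    for chunk in response:
        print(chunk)
except Exception as e:
    # errors raised mid-stream arrive here already mapped by exception_type(),
    # so callers can branch on the exception class instead of a bare "stop"
    print(f"streaming call failed: {type(e).__name__}: {e}")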
@@ -2796,7 +2978,6 @@ def read_config_args(config_path) -> dict:
         # read keys/ values from config file and return them
         return config
     except Exception as e:
-        print("An error occurred while reading config:", str(e))
         raise e
 
 ########## experimental completion variants ############################
@@ -2899,7 +3080,6 @@ def get_model_split_test(models, completion_call_id):
     try:
         # make the api call
         last_fetched_at = time.time()
-        print(f"last_fetched_at: {last_fetched_at}")
         response = requests.post(
             #http://api.litellm.ai
             url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.738"
+version = "0.1.739"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"