diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc index 3907a3f34..b395309b6 100644 Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc index bf977f9fd..5f546c9e3 100644 Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index e63344492..e7c76d0ef 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos } status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - time.sleep(0.0001) + time.sleep(0.0001) # prevent being rate limited by replicate response = requests.get(prediction_url, headers=headers) if response.status_code == 200: response_data = response.json() + status = response_data['status'] + print(f"response data: {response_data}") if "output" in response_data: output_string = "".join(response_data['output']) new_output = output_string[len(previous_output):] - yield new_output + yield {"output": new_output, "status": status} previous_output = output_string status = response_data['status'] diff --git a/litellm/main.py b/litellm/main.py index 1b2789f75..e34355317 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -485,11 +485,11 @@ def completion( # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") replicate_key = None replicate_key = ( - get_secret("REPLICATE_API_KEY") - or get_secret("REPLICATE_API_TOKEN") - or api_key + api_key or litellm.replicate_key - or litellm.api_key + or litellm.api_key + or get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") ) model_response = replicate.completion( @@ -575,7 +575,7 @@ def completion( if "stream" in optional_params and optional_params["stream"] == True: # don't try to access stream object, - response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging) + response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging) return response response = model_response elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": @@ -769,7 +769,7 @@ def completion( if stream: model_response = chat.send_message_streaming(prompt, **optional_params) response = CustomStreamWrapper( - model_response, model, custom_llm_provider="vertexai", logging_obj=logging + model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging ) return response diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index da5994ccd..980aa14f2 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -643,24 +643,6 @@ def test_completion_sagemaker(): # test_completion_sagemaker() -def test_completion_sagemaker_stream(): - litellm.set_verbose = False - try: - response = completion( - model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", - messages=messages, - temperature=0.2, - max_tokens=80, - stream=True, - ) - # Add any assertions here to check the response - for chunk in response: - print(chunk) - except Exception as e: - pytest.fail(f"Error occurred: {e}") - -# test_completion_sagemaker_stream() - def test_completion_bedrock_titan(): try: response 
= completion( diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 10f772c25..495630300 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import litellm -from litellm import completion, acompletion +from litellm import completion, acompletion, AuthenticationError, InvalidRequestError litellm.logging = False litellm.set_verbose = False @@ -187,6 +187,7 @@ def streaming_format_tests(idx, chunk): finished = True if "content" in chunk["choices"][0]["delta"]: extracted_chunk = chunk["choices"][0]["delta"]["content"] + print(f"extracted chunk: {extracted_chunk}") return extracted_chunk, finished def test_completion_cohere_stream(): @@ -199,21 +200,120 @@ def test_completion_cohere_stream(): }, ] response = completion( - model="command-nightly", messages=messages, stream=True, max_tokens=50 + model="command-nightly", messages=messages, stream=True, max_tokens=50, ) complete_response = "" # Add any assertions here to check the response + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished if finished: break complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not in final chunk") if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"Error occurred: {e}") +# test_completion_cohere_stream() + +def test_completion_cohere_stream_bad_key(): + try: + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="command-nightly", messages=messages, stream=True, max_tokens=50, api_key=api_key + ) + complete_response = "" + # Add any assertions here to check the response + has_finish_reason = False + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not in final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except AuthenticationError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_cohere_stream_bad_key() + +# def test_completion_nlp_cloud(): +# try: +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion(model="dolphin", messages=messages, stream=True) +# complete_response = "" +# # Add any assertions here to check the response +# has_finish_reason = False +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# complete_response += chunk +# if finished: +# break +# if has_finish_reason is False: +# raise Exception("Finish reason not in final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# 
test_completion_nlp_cloud() + +# def test_completion_nlp_cloud_bad_key(): +# try: +# api_key = "bad-key" +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion(model="dolphin", messages=messages, stream=True, api_key=api_key) +# complete_response = "" +# # Add any assertions here to check the response +# has_finish_reason = False +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# complete_response += chunk +# if finished: +# break +# if has_finish_reason is False: +# raise Exception("Finish reason not in final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_nlp_cloud_bad_key() + # def test_completion_hf_stream(): # try: # messages = [ @@ -235,10 +335,41 @@ def test_completion_cohere_stream(): # if complete_response.strip() == "": # raise Exception("Empty response received") # print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass # except Exception as e: # pytest.fail(f"Error occurred: {e}") -# test_completion_hf_stream() +# # test_completion_hf_stream() + +# def test_completion_hf_stream_bad_key(): +# try: +# api_key = "bad-key" +# messages = [ +# { +# "content": "Hello! How are you today?", +# "role": "user" +# }, +# ] +# response = completion( +# model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base="https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud", stream=True, max_tokens=1000, api_key=api_key +# ) +# complete_response = "" +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# if finished: +# break +# complete_response += chunk +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_hf_stream_bad_key() def test_completion_claude_stream(): try: @@ -266,19 +397,22 @@ def test_completion_claude_stream(): pytest.fail(f"Error occurred: {e}") # test_completion_claude_stream() -def test_completion_bedrock_ai21_stream(): + +def test_completion_claude_stream_bad_key(): try: - litellm.set_verbose = False + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] response = completion( - model="bedrock/amazon.titan-tg1-large", - messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], - temperature=1, - max_tokens=4096, - stream=True, + model="claude-instant-1", messages=messages, stream=True, max_tokens=50, api_key=api_key ) - complete_response = "" - # Add any assertions here to check the response - print(response) + complete_response = "" + # Add any assertions here to check the response for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) if finished: @@ -286,11 +420,263 @@ def test_completion_bedrock_ai21_stream(): 
complete_response += chunk if complete_response.strip() == "": raise Exception("Empty response received") + print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"Error occurred: {e}") -# test_completion_cohere_stream() +# test_completion_claude_stream_bad_key() + +def test_completion_replicate_stream(): + try: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50 + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_completion_replicate_stream() + +# def test_completion_vertexai_stream(): +# try: +# import os +# os.environ["VERTEXAI_PROJECT"] = "pathrise-convert-1606954137718" +# os.environ["VERTEXAI_LOCATION"] = "us-central1" +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion( +# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50 +# ) +# complete_response = "" +# has_finish_reason = False +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# if finished: +# break +# complete_response += chunk +# if has_finish_reason is False: +# raise Exception("finish reason not set for last chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_vertexai_stream() + + +# def test_completion_vertexai_stream_bad_key(): +# try: +# import os +# messages = [ +# {"role": "system", "content": "You are a helpful assistant."}, +# { +# "role": "user", +# "content": "how does a court case get to the Supreme Court?", +# }, +# ] +# response = completion( +# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50 +# ) +# complete_response = "" +# has_finish_reason = False +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# if finished: +# break +# complete_response += chunk +# if has_finish_reason is False: +# raise Exception("finish reason not set for last chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# print(f"completion_response: {complete_response}") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_vertexai_stream_bad_key() + 
+def test_completion_replicate_stream(): + try: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50 + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +def test_completion_replicate_stream_bad_key(): + try: + api_key = "bad-key" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, + ] + response = completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50, api_key=api_key + ) + complete_response = "" + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + if finished: + break + complete_response += chunk + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_replicate_stream_bad_key() + +def test_completion_bedrock_ai21_stream(): + try: + response = completion( + model="bedrock/amazon.titan-tg1-large", + messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], + temperature=1, + max_tokens=4096, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_bedrock_ai21_stream() + +def test_completion_bedrock_ai21_stream_bad_key(): + try: + response = completion( + model="bedrock/amazon.titan-tg1-large", + messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}], + temperature=1, + max_tokens=4096, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += 
chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_bedrock_ai21_stream_bad_key() + +def test_completion_sagemaker_stream(): + try: + response = completion( + model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", + messages=messages, + temperature=0.2, + max_tokens=80, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except InvalidRequestError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +test_completion_sagemaker_stream() # test on openai completion call def test_openai_text_completion_call(): @@ -314,7 +700,33 @@ def test_openai_text_completion_call(): def ai21_completion_call(): try: response = completion( - model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn + model="j2-ultra", messages=messages, stream=True + ) + print(f"response: {response}") + has_finished = False + complete_response = "" + start_time = time.time() + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finished = finished + complete_response += chunk + if finished: + break + if has_finished is False: + raise Exception("finished reason missing from final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except: + pytest.fail(f"error occurred: {traceback.format_exc()}") + +# ai21_completion_call() + +def ai21_completion_call_bad_key(): + try: + api_key = "bad-key" + response = completion( + model="j2-ultra", messages=messages, stream=True, api_key=api_key ) print(f"response: {response}") complete_response = "" @@ -327,10 +739,64 @@ def ai21_completion_call(): if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") + except InvalidRequestError as e: + pass except: pytest.fail(f"error occurred: {traceback.format_exc()}") -# ai21_completion_call() +# ai21_completion_call_bad_key() + +def test_completion_aleph_alpha(): + try: + response = completion( + model="luminous-base", messages=messages, stream=True + ) + # Add any assertions here to check the response + has_finished = False + complete_response = "" + start_time = time.time() + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finished = finished + complete_response += chunk + if finished: + break + if has_finished is False: + raise Exception("finished reason missing from final chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_completion_aleph_alpha() + +# def test_completion_aleph_alpha_bad_key(): +# try: +# api_key = "bad-key" +# response = completion( +# model="luminous-base", messages=messages, stream=True, 
api_key=api_key +# ) +# # Add any assertions here to check the response +# has_finished = False +# complete_response = "" +# start_time = time.time() +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finished = finished +# complete_response += chunk +# if finished: +# break +# if has_finished is False: +# raise Exception("finished reason missing from final chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_aleph_alpha_bad_key() + # test on openai completion call def test_openai_chat_completion_call(): try: @@ -366,11 +832,15 @@ def test_together_ai_completion_call_starcoder(): ) complete_response = "" print(f"returned response object: {response}") + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished if finished: break complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not set for last chunk") if complete_response == "": raise Exception("Empty response received") print(f"complete response: {complete_response}") @@ -378,6 +848,38 @@ def test_together_ai_completion_call_starcoder(): print(f"error occurred: {traceback.format_exc()}") pass +# test_together_ai_completion_call_starcoder() + +def test_together_ai_completion_call_starcoder_bad_key(): + try: + api_key = "bad-key" + start_time = time.time() + response = completion( + model="together_ai/bigcode/starcoder", + messages=messages, + stream=True, + api_key=api_key + ) + complete_response = "" + has_finish_reason = False + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("Finish reason not set for last chunk") + if complete_response == "": + raise Exception("Empty response received") + print(f"complete response: {complete_response}") + except InvalidRequestError as e: + pass + except: + print(f"error occurred: {traceback.format_exc()}") + pass + +# test_together_ai_completion_call_starcoder_bad_key() #### Test Function calling + streaming #### def test_completion_openai_with_functions(): diff --git a/litellm/utils.py b/litellm/utils.py index 0d2ce8c95..046c82cf1 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2,6 +2,7 @@ import sys import dotenv, json, traceback, threading import subprocess, os import litellm, openai +import itertools import random, uuid, requests import datetime, time import tiktoken @@ -1915,7 +1916,6 @@ def exception_type( ): global user_logger_fn, liteDebuggerClient exception_mapping_worked = False - if litellm.set_verbose == True: litellm.error_logs['EXCEPTION'] = original_exception litellm.error_logs['KWARGS'] = completion_kwargs @@ -1970,7 +1970,7 @@ def exception_type( exception_type = type(original_exception).__name__ else: exception_type = "" - if "claude" in model: # one of the anthropics + if custom_llm_provider == "anthropic": # one of the anthropics if hasattr(original_exception, "message"): if "prompt is too long" in original_exception.message: exception_mapping_worked = True @@ -1979,6 +1979,13 @@ def exception_type( model=model, llm_provider="anthropic" ) + if "Invalid API Key" in original_exception.message: + exception_mapping_worked = True + raise 
AuthenticationError( + message=original_exception.message, + model=model, + llm_provider="anthropic" + ) if hasattr(original_exception, "status_code"): print_verbose(f"status_code: {original_exception.status_code}") if original_exception.status_code == 401: @@ -2031,7 +2038,7 @@ def exception_type( llm_provider="anthropic", model=model ) - elif "replicate" in model: + elif custom_llm_provider == "replicate": if "Incorrect authentication token" in error_str: exception_mapping_worked = True raise AuthenticationError( @@ -2068,7 +2075,7 @@ def exception_type( llm_provider="replicate", model=model ) - elif original_exception.status_code == 400: + elif original_exception.status_code == 400 or original_exception.status_code == 422: exception_mapping_worked = True raise InvalidRequestError( message=f"ReplicateException - {original_exception.message}", @@ -2110,7 +2117,31 @@ def exception_type( llm_provider="replicate", model=model ) - elif model in litellm.cohere_models or custom_llm_provider == "cohere": # Cohere + elif custom_llm_provider == "bedrock": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"BedrockException - {error_str}", + model=model, + llm_provider="bedrock" + ) + elif custom_llm_provider == "sagemaker": + if "Unable to locate credentials" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"SagemakerException - {error_str}", + model=model, + llm_provider="sagemaker" + ) + elif custom_llm_provider == "vertex_ai": + if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"VertexAIException - {error_str}", + model=model, + llm_provider="vertex_ai" + ) + elif custom_llm_provider == "cohere": # Cohere if ( "invalid api token" in error_str or "No API key provided." in error_str @@ -2184,6 +2215,13 @@ def exception_type( model=model, llm_provider="huggingface" ) + elif "A valid user token is required" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=error_str, + llm_provider="huggingface", + model=model + ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: exception_mapping_worked = True @@ -2221,6 +2259,8 @@ def exception_type( llm_provider="huggingface", model=model ) + exception_mapping_worked = True + raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider) elif custom_llm_provider == "ai21": if hasattr(original_exception, "message"): if "Prompt has too many tokens" in original_exception.message: @@ -2230,6 +2270,13 @@ def exception_type( model=model, llm_provider="ai21" ) + if "Bad or missing API token." 
in original_exception.message: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AI21Exception - {original_exception.message}", + model=model, + llm_provider="ai21" + ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: exception_mapping_worked = True @@ -2266,7 +2313,7 @@ def exception_type( llm_provider="ai21", model=model ) - elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud": + elif custom_llm_provider == "nlp_cloud": if "detail" in error_str: if "Input text length should not exceed" in error_str: exception_mapping_worked = True @@ -2342,6 +2389,7 @@ def exception_type( model=model ) elif custom_llm_provider == "together_ai": + import json error_response = json.loads(error_str) if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]: exception_mapping_worked = True @@ -2364,6 +2412,13 @@ def exception_type( model=model, llm_provider="together_ai" ) + elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"TogetherAIException - {error_response['error']}", + model=model, + llm_provider="together_ai" + ) elif "error_type" in error_response and error_response["error_type"] == "validation": exception_mapping_worked = True raise InvalidRequestError( @@ -2393,7 +2448,7 @@ def exception_type( llm_provider="together_ai", model=model ) - elif model in litellm.aleph_alpha_models: + elif custom_llm_provider == "aleph_alpha": if "This is longer than the model's maximum context length" in error_str: exception_mapping_worked = True raise ContextWindowExceededError( @@ -2401,6 +2456,13 @@ def exception_type( llm_provider="aleph_alpha", model=model ) + elif "InvalidToken" in error_str or "No token provided" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AlephAlphaException - {original_exception.message}", + llm_provider="aleph_alpha", + model=model + ) elif hasattr(original_exception, "status_code"): print(f"status code: {original_exception.status_code}") if original_exception.status_code == 401: @@ -2445,7 +2507,8 @@ def exception_type( elif custom_llm_provider == "ollama": if "no attribute 'async_get_ollama_response_stream" in error_str: raise ImportError("Import error - trying to use async for ollama. import async_generator failed. 
Try 'pip install async_generator'") - raise original_exception + exception_mapping_worked = True + raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model) except Exception as e: # LOGGING exception_logging( @@ -2563,6 +2626,7 @@ class CustomStreamWrapper: self.logging_obj = logging_obj self.completion_stream = completion_stream self.sent_first_chunk = False + self.sent_last_chunk = False if self.logging_obj: # Log the type of the received item self.logging_obj.post_call(str(type(completion_stream))) @@ -2579,41 +2643,71 @@ class CustomStreamWrapper: def handle_anthropic_chunk(self, chunk): str_line = chunk.decode("utf-8") # Convert bytes to string + print(f"str_line: {str_line}") + text = "" + is_finished = False + finish_reason = None if str_line.startswith("data:"): data_json = json.loads(str_line[5:]) - return data_json.get("completion", "") - return "" + text = data_json.get("completion", "") + if data_json.get("stop_reason", None): + is_finished = True + finish_reason = data_json["stop_reason"] + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "error" in str_line: + raise ValueError(f"Unable to parse response. Original response: {str_line}") + else: + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} def handle_together_ai_chunk(self, chunk): chunk = chunk.decode("utf-8") - text_index = chunk.find('"text":"') # this checks if text: exists - text_start = text_index + len('"text":"') - text_end = chunk.find('"}', text_start) - if text_index != -1 and text_end != -1: - extracted_text = chunk[text_start:text_end] - return extracted_text + text = "" + is_finished = False + finish_reason = None + if "text" in chunk: + text_index = chunk.find('"text":"') # this checks if text: exists + text_start = text_index + len('"text":"') + text_end = chunk.find('"}', text_start) + if text_index != -1 and text_end != -1: + extracted_text = chunk[text_start:text_end] + text = extracted_text + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "[DONE]" in chunk: + return {"text": text, "is_finished": True, "finish_reason": "stop"} + elif "error" in chunk: + raise ValueError(chunk) else: - return "" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} def handle_huggingface_chunk(self, chunk): chunk = chunk.decode("utf-8") + text = "" + is_finished = False + finish_reason = "" if chunk.startswith("data:"): data_json = json.loads(chunk[5:]) + print(f"data json: {data_json}") if "token" in data_json and "text" in data_json["token"]: text = data_json["token"]["text"] if "meta-llama/Llama-2" in self.model: #clean eos tokens like from the returned output text if any(token in text for token in llama_2_special_tokens): text = text.replace("", "").replace("", "") - return text - else: - return "" - return "" + if data_json.get("details", False) and data_json["details"].get("finish_reason", False): + is_finished = True + finish_reason = data_json["details"]["finish_reason"] + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif "error" in chunk: + raise ValueError(chunk) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} - def handle_ai21_chunk(self, chunk): + def handle_ai21_chunk(self, chunk): # fake streaming chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["completions"][0]["data"]["text"] + text = 
data_json["completions"][0]["data"]["text"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2621,8 +2715,10 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - print(f"data json: {data_json}") - return data_json["generated_text"] + text = data_json["generated_text"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2630,7 +2726,10 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["completions"][0]["completion"] + text = data_json["completions"][0]["completion"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. Original response: {chunk}") @@ -2638,7 +2737,35 @@ class CustomStreamWrapper: chunk = chunk.decode("utf-8") data_json = json.loads(chunk) try: - return data_json["text"] + text = "" + is_finished = False + finish_reason = "" + if "text" in data_json: + text = data_json["text"] + elif "is_finished" in data_json: + is_finished = data_json["is_finished"] + finish_reason = data_json["finish_reason"] + else: + raise Exception(data_json) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + except: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_replicate_chunk(self, chunk): + print(f"chunk: {chunk}") + try: + text = "" + is_finished = False + finish_reason = "" + if "output" in chunk: + text = chunk['output'] + if "status" in chunk: + if chunk["status"] == "succeeded": + is_finished = True + finish_reason = "stop" + elif chunk.get("error", None): + raise Exception(chunk["error"]) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} except: raise ValueError(f"Unable to parse response. 
Original response: {chunk}") @@ -2683,13 +2810,21 @@ class CustomStreamWrapper: traceback.print_exc() return "" - def handle_bedrock_stream(self): - if self.completion_stream: - event = next(self.completion_stream) - chunk = event.get('chunk') - if chunk: - chunk_data = json.loads(chunk.get('bytes').decode()) - return chunk_data['outputText'] + def handle_bedrock_stream(self, chunk): + chunk = chunk.get('chunk') + if chunk: + chunk_data = json.loads(chunk.get('bytes').decode()) + text = "" + is_finished = False + finish_reason = "" + if "outputText" in chunk_data: + text = chunk_data['outputText'] + if chunk_data.get("completionReason", None): + is_finished = True + finish_reason = chunk_data["completionReason"] + elif chunk.get("error", None): + raise Exception(chunk["error"]) + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} return "" ## needs to handle the empty string case (even starting chunk can be an empty string) @@ -2701,49 +2836,94 @@ class CustomStreamWrapper: completion_obj = {"content": ""} if self.custom_llm_provider and self.custom_llm_provider == "anthropic": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_anthropic_chunk(chunk) + response_obj = self.handle_anthropic_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.model == "replicate" or self.custom_llm_provider == "replicate": chunk = next(self.completion_stream) - completion_obj["content"] = chunk + response_obj = self.handle_replicate_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif ( self.custom_llm_provider and self.custom_llm_provider == "together_ai"): chunk = next(self.completion_stream) - text_data = self.handle_together_ai_chunk(chunk) - if text_data == "": - return self.__next__() - completion_obj["content"] = text_data + response_obj = self.handle_together_ai_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_huggingface_chunk(chunk) + response_obj = self.handle_huggingface_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming chunk = next(self.completion_stream) completion_obj["content"] = self.handle_baseten_chunk(chunk) elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_ai21_chunk(chunk) + response_obj = self.handle_ai21_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "vllm": chunk = next(self.completion_stream) completion_obj["content"] = chunk[0].outputs[0].text - elif self.custom_llm_provider and self.custom_llm_provider == "aleph-alpha": #aleph alpha doesn't provide streaming + 
elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk) + response_obj = self.handle_aleph_alpha_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai": chunk = next(self.completion_stream) completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk) elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud": - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk) - elif self.model in (litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models): - chunk = next(self.completion_stream) - completion_obj["content"] = str(chunk) + try: + chunk = next(self.completion_stream) + response_obj = self.handle_nlp_cloud_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] + except Exception as e: + if self.sent_last_chunk: + raise e + else: + if self.sent_first_chunk is False: + raise Exception("An unknown error occurred with the stream") + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True + elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai": + try: + chunk = next(self.completion_stream) + completion_obj["content"] = str(chunk) + except StopIteration as e: + if self.sent_last_chunk: + raise e + else: + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True elif self.custom_llm_provider == "cohere": chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_cohere_chunk(chunk) + response_obj = self.handle_cohere_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "bedrock": - completion_obj["content"] = self.handle_bedrock_stream() + chunk = next(self.completion_stream) + response_obj = self.handle_bedrock_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "sagemaker": if len(self.completion_stream)==0: - raise StopIteration + if self.sent_last_chunk: + raise StopIteration + else: + model_response.choices[0].finish_reason = "stop" + self.sent_last_chunk = True chunk_size = 30 new_chunk = self.completion_stream[:chunk_size] completion_obj["content"] = new_chunk @@ -2765,11 +2945,13 @@ class CustomStreamWrapper: self.sent_first_chunk = True model_response.choices[0].delta = Delta(**completion_obj) return model_response + elif model_response.choices[0].finish_reason: + return model_response except StopIteration: raise StopIteration - except Exception as e: - model_response.choices[0].finish_reason = "stop" - return model_response + except Exception as e: + e.message = str(e) + return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e) async def __anext__(self): try: @@ -2796,7 +2978,6 @@ def read_config_args(config_path) -> dict: # 
read keys/ values from config file and return them return config except Exception as e: - print("An error occurred while reading config:", str(e)) raise e ########## experimental completion variants ############################ @@ -2899,7 +3080,6 @@ def get_model_split_test(models, completion_call_id): try: # make the api call last_fetched_at = time.time() - print(f"last_fetched_at: {last_fetched_at}") response = requests.post( #http://api.litellm.ai url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict diff --git a/pyproject.toml b/pyproject.toml index bf0f0097c..80b3eb99d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.1.738" +version = "0.1.739" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License"
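The through-line of the utils.py changes is that every chunk handler in CustomStreamWrapper now returns a {"text", "is_finished", "finish_reason"} dict instead of a bare string, which is what lets the wrapper stamp choices[0].finish_reason onto the final chunk. A minimal standalone sketch of that contract, modeled on the updated handle_anthropic_chunk (the SSE payload shape comes from the diff; the function name and the example event below are illustrative only):

import json

def parse_anthropic_sse_line(raw: bytes) -> dict:
    # sketch of the new handler contract: always hand back text plus finish state
    str_line = raw.decode("utf-8")
    text, is_finished, finish_reason = "", False, None
    if str_line.startswith("data:"):
        data_json = json.loads(str_line[5:])
        text = data_json.get("completion", "")
        if data_json.get("stop_reason"):
            is_finished = True
            finish_reason = data_json["stop_reason"]
    elif "error" in str_line:
        raise ValueError(f"Unable to parse response. Original response: {str_line}")
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

# the terminal SSE event carries a stop_reason, so the wrapper can close the stream cleanly
final_event = b'data: {"completion": "", "stop_reason": "stop_sequence"}'
assert parse_anthropic_sse_line(final_event)["is_finished"] is True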
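On the consumer side, the rewritten streaming tests all assert the same two things: the concatenated deltas are non-empty and the last chunk carries a finish_reason. A hedged sketch of that pattern as a plain function; the model and prompt are placeholders, and any provider touched in this diff could be substituted:

import litellm

def consume_stream(model: str = "command-nightly") -> str:
    messages = [{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
    response = litellm.completion(model=model, messages=messages, stream=True, max_tokens=50)

    complete_response = ""
    has_finish_reason = False
    for chunk in response:
        choice = chunk["choices"][0]
        if choice["finish_reason"]:          # only the final chunk should set this
            has_finish_reason = True
            break
        if "content" in choice["delta"]:
            complete_response += choice["delta"]["content"]

    if not has_finish_reason:
        raise Exception("Finish reason not in final chunk")
    if complete_response.strip() == "":
        raise Exception("Empty response received")
    return complete_response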
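The Replicate change is the same idea applied at the source: the polling loop now yields each delta together with the prediction status, so handle_replicate_chunk can translate status == "succeeded" into finish_reason "stop". A sketch of that scheme, following the logic in handle_prediction_response_streaming (prediction_url and api_token are placeholders, and the Authorization header format is an assumption of this sketch):

import time
import requests

def stream_replicate_prediction(prediction_url: str, api_token: str):
    headers = {"Authorization": f"Token {api_token}"}  # assumed header format for this sketch
    previous_output = ""
    status = ""
    while status not in ["succeeded", "failed", "canceled"]:
        time.sleep(0.0001)  # brief pause between polls to avoid being rate limited by Replicate
        response = requests.get(prediction_url, headers=headers)
        if response.status_code != 200:
            continue
        response_data = response.json()
        status = response_data["status"]
        if "output" in response_data:
            output_string = "".join(response_data["output"])
            new_output = output_string[len(previous_output):]
            previous_output = output_string
            yield {"output": new_output, "status": status}  # status rides along with every delta

def replicate_chunk_to_piece(chunk: dict) -> dict:
    # mirrors handle_replicate_chunk: a "succeeded" status becomes finish_reason "stop"
    if chunk.get("error"):
        raise Exception(chunk["error"])
    is_finished = chunk.get("status") == "succeeded"
    return {"text": chunk.get("output", ""),
            "is_finished": is_finished,
            "finish_reason": "stop" if is_finished else ""}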
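The new *_bad_key tests and the exception_type changes go together: failures are now routed by custom_llm_provider rather than by model-name matching, and re-raised as litellm's mapped types (AuthenticationError for rejected keys, InvalidRequestError for 400/422-style responses, APIError as the fallback). A small sketch of the test idiom; the model name is only an example:

from litellm import completion, AuthenticationError, InvalidRequestError

def expect_mapped_key_error(model: str = "command-nightly") -> None:
    messages = [{"role": "user", "content": "hello"}]
    try:
        response = completion(model=model, messages=messages, stream=True,
                              max_tokens=50, api_key="bad-key")
        for _ in response:  # some providers only surface the failure once iteration begins
            pass
    except (AuthenticationError, InvalidRequestError):
        return  # the behavior the bad-key tests assert
    raise AssertionError("expected a mapped authentication/invalid-request error")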
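Finally, main.py flips the Replicate key lookup so an explicitly passed api_key wins over environment secrets. A sketch of that precedence as a standalone helper; resolve_replicate_key is a hypothetical name and os.environ stands in for litellm's get_secret:

import os

def resolve_replicate_key(api_key=None, module_replicate_key=None, module_api_key=None):
    # explicit argument first, module-level settings next, environment secrets last
    return (
        api_key
        or module_replicate_key
        or module_api_key
        or os.environ.get("REPLICATE_API_KEY")
        or os.environ.get("REPLICATE_API_TOKEN")
    )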