fix exception mapping for streaming

Krrish Dholakia 2023-09-23 15:04:34 -07:00
parent f984e5f380
commit 889679a0dd
8 changed files with 766 additions and 100 deletions
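Across the hunks below, the streaming chunk handlers in CustomStreamWrapper converge on one return shape: a dict carrying the text delta, whether the stream is finished, and the finish reason. A minimal sketch of that contract, assuming only the field names visible in the diff (the handler name and input here are hypothetical, not one of the real handlers):

# hypothetical handler; the real ones (handle_anthropic_chunk, handle_cohere_chunk, ...) parse provider-specific payloads
def handle_example_chunk(data_json):
    text = data_json.get("completion", "")              # provider-specific text field
    finish_reason = data_json.get("stop_reason", None)  # provider-specific stop marker
    is_finished = finish_reason is not None
    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}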

View file

@@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos
}
status = ""
while True and (status not in ["succeeded", "failed", "canceled"]):
time.sleep(0.0001)
time.sleep(0.0001) # prevent being rate limited by replicate
response = requests.get(prediction_url, headers=headers)
if response.status_code == 200:
response_data = response.json()
status = response_data['status']
print(f"response data: {response_data}")
if "output" in response_data:
output_string = "".join(response_data['output'])
new_output = output_string[len(previous_output):]
yield new_output
yield {"output": new_output, "status": status}
previous_output = output_string
status = response_data['status']
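A minimal sketch of how a caller might consume this generator after the change; the function name and dict keys come from the hunk above, while the URL, token, and verbosity callback are placeholders:

prediction_url = "https://api.replicate.com/v1/predictions/<id>"   # placeholder
api_token = "r8_..."                                               # placeholder
full_text = ""
for chunk in handle_prediction_response_streaming(prediction_url, api_token, print_verbose=print):
    full_text += chunk["output"]                          # incremental text delta
    if chunk["status"] in ("succeeded", "failed", "canceled"):
        break                                             # terminal status, stream is done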

View file

@@ -485,11 +485,11 @@ def completion(
# Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
replicate_key = None
replicate_key = (
get_secret("REPLICATE_API_KEY")
or get_secret("REPLICATE_API_TOKEN")
or api_key
api_key
or litellm.replicate_key
or litellm.api_key
or litellm.api_key
or get_secret("REPLICATE_API_KEY")
or get_secret("REPLICATE_API_TOKEN")
)
model_response = replicate.completion(
@@ -575,7 +575,7 @@ def completion(
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object,
response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging)
response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging)
return response
response = model_response
elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
@@ -769,7 +769,7 @@ def completion(
if stream:
model_response = chat.send_message_streaming(prompt, **optional_params)
response = CustomStreamWrapper(
model_response, model, custom_llm_provider="vertexai", logging_obj=logging
model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
)
return response
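The provider tags passed to CustomStreamWrapper above now match the keys that the stream wrapper and exception mapping branch on ("aleph_alpha", "vertex_ai"). A user-facing sketch of the streaming path this enables, mirroring the tests later in this commit (project and location credentials are assumed to already be configured in the environment):

import litellm
response = litellm.completion(
    model="vertex_ai/chat-bison",   # provider prefix matches custom_llm_provider="vertex_ai"
    messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}],
    stream=True,
)
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"])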

View file

@@ -643,24 +643,6 @@ def test_completion_sagemaker():
# test_completion_sagemaker()
def test_completion_sagemaker_stream():
litellm.set_verbose = False
try:
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=messages,
temperature=0.2,
max_tokens=80,
stream=True,
)
# Add any assertions here to check the response
for chunk in response:
print(chunk)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream()
def test_completion_bedrock_titan():
try:
response = completion(

View file

@@ -9,7 +9,7 @@ sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import completion, acompletion
from litellm import completion, acompletion, AuthenticationError, InvalidRequestError
litellm.logging = False
litellm.set_verbose = False
@@ -187,6 +187,7 @@ def streaming_format_tests(idx, chunk):
finished = True
if "content" in chunk["choices"][0]["delta"]:
extracted_chunk = chunk["choices"][0]["delta"]["content"]
print(f"extracted chunk: {extracted_chunk}")
return extracted_chunk, finished
def test_completion_cohere_stream():
@@ -199,21 +200,120 @@ def test_completion_cohere_stream():
},
]
response = completion(
model="command-nightly", messages=messages, stream=True, max_tokens=50
model="command-nightly", messages=messages, stream=True, max_tokens=50,
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not in final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream()
def test_completion_cohere_stream_bad_key():
try:
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="command-nightly", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not in final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except AuthenticationError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream_bad_key()
# def test_completion_nlp_cloud():
# try:
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(model="dolphin", messages=messages, stream=True)
# complete_response = ""
# # Add any assertions here to check the response
# has_finish_reason = False
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# complete_response += chunk
# if finished:
# break
# if has_finish_reason is False:
# raise Exception("Finish reason not in final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_nlp_cloud()
# def test_completion_nlp_cloud_bad_key():
# try:
# api_key = "bad-key"
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(model="dolphin", messages=messages, stream=True, api_key=api_key)
# complete_response = ""
# # Add any assertions here to check the response
# has_finish_reason = False
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# complete_response += chunk
# if finished:
# break
# if has_finish_reason is False:
# raise Exception("Finish reason not in final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_nlp_cloud_bad_key()
# def test_completion_hf_stream():
# try:
# messages = [
@@ -235,10 +335,41 @@ def test_completion_cohere_stream():
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_hf_stream()
# # test_completion_hf_stream()
# def test_completion_hf_stream_bad_key():
# try:
# api_key = "bad-key"
# messages = [
# {
# "content": "Hello! How are you today?",
# "role": "user"
# },
# ]
# response = completion(
# model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base="https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud", stream=True, max_tokens=1000, api_key=api_key
# )
# complete_response = ""
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# if finished:
# break
# complete_response += chunk
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_hf_stream_bad_key()
def test_completion_claude_stream():
try:
@@ -266,19 +397,22 @@ def test_completion_claude_stream():
pytest.fail(f"Error occurred: {e}")
# test_completion_claude_stream()
def test_completion_bedrock_ai21_stream():
def test_completion_claude_stream_bad_key():
try:
litellm.set_verbose = False
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
model="claude-instant-1", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
print(response)
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
@@ -286,11 +420,263 @@ def test_completion_bedrock_ai21_stream():
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_cohere_stream()
# test_completion_claude_stream_bad_key()
def test_completion_replicate_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_replicate_stream()
# def test_completion_vertexai_stream():
# try:
# import os
# os.environ["VERTEXAI_PROJECT"] = "pathrise-convert-1606954137718"
# os.environ["VERTEXAI_LOCATION"] = "us-central1"
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(
# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
# )
# complete_response = ""
# has_finish_reason = False
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# if finished:
# break
# complete_response += chunk
# if has_finish_reason is False:
# raise Exception("finish reason not set for last chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_vertexai_stream()
# def test_completion_vertexai_stream_bad_key():
# try:
# import os
# messages = [
# {"role": "system", "content": "You are a helpful assistant."},
# {
# "role": "user",
# "content": "how does a court case get to the Supreme Court?",
# },
# ]
# response = completion(
# model="vertex_ai/chat-bison", messages=messages, stream=True, max_tokens=50
# )
# complete_response = ""
# has_finish_reason = False
# # Add any assertions here to check the response
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished
# if finished:
# break
# complete_response += chunk
# if has_finish_reason is False:
# raise Exception("finish reason not set for last chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# print(f"completion_response: {complete_response}")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_vertexai_stream_bad_key()
def test_completion_replicate_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stream_bad_key():
try:
api_key = "bad-key"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
response = completion(
model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", messages=messages, stream=True, max_tokens=50, api_key=api_key
)
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
break
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_replicate_stream_bad_key()
def test_completion_bedrock_ai21_stream():
try:
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
complete_response += chunk
if finished:
break
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_ai21_stream()
def test_completion_bedrock_ai21_stream_bad_key():
try:
response = completion(
model="bedrock/amazon.titan-tg1-large",
messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
temperature=1,
max_tokens=4096,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_bedrock_ai21_stream_bad_key()
def test_completion_sagemaker_stream():
try:
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=messages,
temperature=0.2,
max_tokens=80,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except InvalidRequestError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
test_completion_sagemaker_stream()
# test on openai completion call
def test_openai_text_completion_call():
@@ -314,7 +700,33 @@ def test_openai_text_completion_call():
def ai21_completion_call():
try:
response = completion(
model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn
model="j2-ultra", messages=messages, stream=True
)
print(f"response: {response}")
has_finished = False
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finished = finished
complete_response += chunk
if finished:
break
if has_finished is False:
raise Exception("finished reason missing from final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except:
pytest.fail(f"error occurred: {traceback.format_exc()}")
# ai21_completion_call()
def ai21_completion_call_bad_key():
try:
api_key = "bad-key"
response = completion(
model="j2-ultra", messages=messages, stream=True, api_key=api_key
)
print(f"response: {response}")
complete_response = ""
@@ -327,10 +739,64 @@ def ai21_completion_call():
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except InvalidRequestError as e:
pass
except:
pytest.fail(f"error occurred: {traceback.format_exc()}")
# ai21_completion_call()
# ai21_completion_call_bad_key()
def test_completion_aleph_alpha():
try:
response = completion(
model="luminous-base", messages=messages, stream=True
)
# Add any assertions here to check the response
has_finished = False
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finished = finished
complete_response += chunk
if finished:
break
if has_finished is False:
raise Exception("finished reason missing from final chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_aleph_alpha()
# def test_completion_aleph_alpha_bad_key():
# try:
# api_key = "bad-key"
# response = completion(
# model="luminous-base", messages=messages, stream=True, api_key=api_key
# )
# # Add any assertions here to check the response
# has_finished = False
# complete_response = ""
# start_time = time.time()
# for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk)
# has_finished = finished
# complete_response += chunk
# if finished:
# break
# if has_finished is False:
# raise Exception("finished reason missing from final chunk")
# if complete_response.strip() == "":
# raise Exception("Empty response received")
# except InvalidRequestError as e:
# pass
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_aleph_alpha_bad_key()
# test on openai completion call
def test_openai_chat_completion_call():
try:
@@ -366,11 +832,15 @@ def test_together_ai_completion_call_starcoder():
)
complete_response = ""
print(f"returned response object: {response}")
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not set for last chunk")
if complete_response == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
@@ -378,6 +848,38 @@ def test_together_ai_completion_call_starcoder():
print(f"error occurred: {traceback.format_exc()}")
pass
# test_together_ai_completion_call_starcoder()
def test_together_ai_completion_call_starcoder_bad_key():
try:
api_key = "bad-key"
start_time = time.time()
response = completion(
model="together_ai/bigcode/starcoder",
messages=messages,
stream=True,
api_key=api_key
)
complete_response = ""
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("Finish reason not set for last chunk")
if complete_response == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
except InvalidRequestError as e:
pass
except:
print(f"error occurred: {traceback.format_exc()}")
pass
# test_together_ai_completion_call_starcoder_bad_key()
#### Test Function calling + streaming ####
def test_completion_openai_with_functions():

View file

@@ -2,6 +2,7 @@ import sys
import dotenv, json, traceback, threading
import subprocess, os
import litellm, openai
import itertools
import random, uuid, requests
import datetime, time
import tiktoken
@@ -1915,7 +1916,6 @@ def exception_type(
):
global user_logger_fn, liteDebuggerClient
exception_mapping_worked = False
if litellm.set_verbose == True:
litellm.error_logs['EXCEPTION'] = original_exception
litellm.error_logs['KWARGS'] = completion_kwargs
@@ -1970,7 +1970,7 @@ def exception_type(
exception_type = type(original_exception).__name__
else:
exception_type = ""
if "claude" in model: # one of the anthropics
if custom_llm_provider == "anthropic": # one of the anthropics
if hasattr(original_exception, "message"):
if "prompt is too long" in original_exception.message:
exception_mapping_worked = True
@@ -1979,6 +1979,13 @@ def exception_type(
model=model,
llm_provider="anthropic"
)
if "Invalid API Key" in original_exception.message:
exception_mapping_worked = True
raise AuthenticationError(
message=original_exception.message,
model=model,
llm_provider="anthropic"
)
if hasattr(original_exception, "status_code"):
print_verbose(f"status_code: {original_exception.status_code}")
if original_exception.status_code == 401:
@@ -2031,7 +2038,7 @@ def exception_type(
llm_provider="anthropic",
model=model
)
elif "replicate" in model:
elif custom_llm_provider == "replicate":
if "Incorrect authentication token" in error_str:
exception_mapping_worked = True
raise AuthenticationError(
@@ -2068,7 +2075,7 @@ def exception_type(
llm_provider="replicate",
model=model
)
elif original_exception.status_code == 400:
elif original_exception.status_code == 400 or original_exception.status_code == 422:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"ReplicateException - {original_exception.message}",
@@ -2110,7 +2117,31 @@ def exception_type(
llm_provider="replicate",
model=model
)
elif model in litellm.cohere_models or custom_llm_provider == "cohere": # Cohere
elif custom_llm_provider == "bedrock":
if "Unable to locate credentials" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"BedrockException - {error_str}",
model=model,
llm_provider="bedrock"
)
elif custom_llm_provider == "sagemaker":
if "Unable to locate credentials" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"SagemakerException - {error_str}",
model=model,
llm_provider="sagemaker"
)
elif custom_llm_provider == "vertex_ai":
if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"VertexAIException - {error_str}",
model=model,
llm_provider="vertex_ai"
)
elif custom_llm_provider == "cohere": # Cohere
if (
"invalid api token" in error_str
or "No API key provided." in error_str
@@ -2184,6 +2215,13 @@ def exception_type(
model=model,
llm_provider="huggingface"
)
elif "A valid user token is required" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=error_str,
llm_provider="huggingface",
model=model
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
exception_mapping_worked = True
@@ -2221,6 +2259,8 @@ def exception_type(
llm_provider="huggingface",
model=model
)
exception_mapping_worked = True
raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
elif custom_llm_provider == "ai21":
if hasattr(original_exception, "message"):
if "Prompt has too many tokens" in original_exception.message:
@@ -2230,6 +2270,13 @@ def exception_type(
model=model,
llm_provider="ai21"
)
if "Bad or missing API token." in original_exception.message:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"AI21Exception - {original_exception.message}",
model=model,
llm_provider="ai21"
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
exception_mapping_worked = True
@@ -2266,7 +2313,7 @@ def exception_type(
llm_provider="ai21",
model=model
)
elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
elif custom_llm_provider == "nlp_cloud":
if "detail" in error_str:
if "Input text length should not exceed" in error_str:
exception_mapping_worked = True
@@ -2342,6 +2389,7 @@ def exception_type(
model=model
)
elif custom_llm_provider == "together_ai":
import json
error_response = json.loads(error_str)
if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
exception_mapping_worked = True
@@ -2364,6 +2412,13 @@ def exception_type(
model=model,
llm_provider="together_ai"
)
elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai"
)
elif "error_type" in error_response and error_response["error_type"] == "validation":
exception_mapping_worked = True
raise InvalidRequestError(
@@ -2393,7 +2448,7 @@ def exception_type(
llm_provider="together_ai",
model=model
)
elif model in litellm.aleph_alpha_models:
elif custom_llm_provider == "aleph_alpha":
if "This is longer than the model's maximum context length" in error_str:
exception_mapping_worked = True
raise ContextWindowExceededError(
@@ -2401,6 +2456,13 @@ def exception_type(
llm_provider="aleph_alpha",
model=model
)
elif "InvalidToken" in error_str or "No token provided" in error_str:
exception_mapping_worked = True
raise InvalidRequestError(
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model
)
elif hasattr(original_exception, "status_code"):
print(f"status code: {original_exception.status_code}")
if original_exception.status_code == 401:
@@ -2445,7 +2507,8 @@ def exception_type(
elif custom_llm_provider == "ollama":
if "no attribute 'async_get_ollama_response_stream" in error_str:
raise ImportError("Import error - trying to use async for ollama. import async_generator failed. Try 'pip install async_generator'")
raise original_exception
exception_mapping_worked = True
raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
except Exception as e:
# LOGGING
exception_logging(
@@ -2563,6 +2626,7 @@ class CustomStreamWrapper:
self.logging_obj = logging_obj
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
if self.logging_obj:
# Log the type of the received item
self.logging_obj.post_call(str(type(completion_stream)))
@@ -2579,41 +2643,71 @@ class CustomStreamWrapper:
def handle_anthropic_chunk(self, chunk):
str_line = chunk.decode("utf-8") # Convert bytes to string
print(f"str_line: {str_line}")
text = ""
is_finished = False
finish_reason = None
if str_line.startswith("data:"):
data_json = json.loads(str_line[5:])
return data_json.get("completion", "")
return ""
text = data_json.get("completion", "")
if data_json.get("stop_reason", None):
is_finished = True
finish_reason = data_json["stop_reason"]
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in str_line:
raise ValueError(f"Unable to parse response. Original response: {str_line}")
else:
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_together_ai_chunk(self, chunk):
chunk = chunk.decode("utf-8")
text_index = chunk.find('"text":"') # this checks if text: exists
text_start = text_index + len('"text":"')
text_end = chunk.find('"}', text_start)
if text_index != -1 and text_end != -1:
extracted_text = chunk[text_start:text_end]
return extracted_text
text = ""
is_finished = False
finish_reason = None
if "text" in chunk:
text_index = chunk.find('"text":"') # this checks if text: exists
text_start = text_index + len('"text":"')
text_end = chunk.find('"}', text_start)
if text_index != -1 and text_end != -1:
extracted_text = chunk[text_start:text_end]
text = extracted_text
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "[DONE]" in chunk:
return {"text": text, "is_finished": True, "finish_reason": "stop"}
elif "error" in chunk:
raise ValueError(chunk)
else:
return ""
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_huggingface_chunk(self, chunk):
chunk = chunk.decode("utf-8")
text = ""
is_finished = False
finish_reason = ""
if chunk.startswith("data:"):
data_json = json.loads(chunk[5:])
print(f"data json: {data_json}")
if "token" in data_json and "text" in data_json["token"]:
text = data_json["token"]["text"]
if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text
if any(token in text for token in llama_2_special_tokens):
text = text.replace("<s>", "").replace("</s>", "")
return text
else:
return ""
return ""
if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
is_finished = True
finish_reason = data_json["details"]["finish_reason"]
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in chunk:
raise ValueError(chunk)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_ai21_chunk(self, chunk):
def handle_ai21_chunk(self, chunk): # fake streaming
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["completions"][0]["data"]["text"]
text = data_json["completions"][0]["data"]["text"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2621,8 +2715,10 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
print(f"data json: {data_json}")
return data_json["generated_text"]
text = data_json["generated_text"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2630,7 +2726,10 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["completions"][0]["completion"]
text = data_json["completions"][0]["completion"]
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2638,7 +2737,35 @@ class CustomStreamWrapper:
chunk = chunk.decode("utf-8")
data_json = json.loads(chunk)
try:
return data_json["text"]
text = ""
is_finished = False
finish_reason = ""
if "text" in data_json:
text = data_json["text"]
elif "is_finished" in data_json:
is_finished = data_json["is_finished"]
finish_reason = data_json["finish_reason"]
else:
raise Exception(data_json)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
def handle_replicate_chunk(self, chunk):
print(f"chunk: {chunk}")
try:
text = ""
is_finished = False
finish_reason = ""
if "output" in chunk:
text = chunk['output']
if "status" in chunk:
if chunk["status"] == "succeeded":
is_finished = True
finish_reason = "stop"
elif chunk.get("error", None):
raise Exception(chunk["error"])
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2683,13 +2810,21 @@ class CustomStreamWrapper:
traceback.print_exc()
return ""
def handle_bedrock_stream(self):
if self.completion_stream:
event = next(self.completion_stream)
chunk = event.get('chunk')
if chunk:
chunk_data = json.loads(chunk.get('bytes').decode())
return chunk_data['outputText']
def handle_bedrock_stream(self, chunk):
chunk = chunk.get('chunk')
if chunk:
chunk_data = json.loads(chunk.get('bytes').decode())
text = ""
is_finished = False
finish_reason = ""
if "outputText" in chunk_data:
text = chunk_data['outputText']
if chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
elif chunk.get("error", None):
raise Exception(chunk["error"])
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
return ""
## needs to handle the empty string case (even starting chunk can be an empty string)
@@ -2701,49 +2836,94 @@ class CustomStreamWrapper:
completion_obj = {"content": ""}
if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_anthropic_chunk(chunk)
response_obj = self.handle_anthropic_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.model == "replicate" or self.custom_llm_provider == "replicate":
chunk = next(self.completion_stream)
completion_obj["content"] = chunk
response_obj = self.handle_replicate_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif (
self.custom_llm_provider and self.custom_llm_provider == "together_ai"):
chunk = next(self.completion_stream)
text_data = self.handle_together_ai_chunk(chunk)
if text_data == "":
return self.__next__()
completion_obj["content"] = text_data
response_obj = self.handle_together_ai_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_huggingface_chunk(chunk)
response_obj = self.handle_huggingface_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_baseten_chunk(chunk)
elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_ai21_chunk(chunk)
response_obj = self.handle_ai21_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
chunk = next(self.completion_stream)
completion_obj["content"] = chunk[0].outputs[0].text
elif self.custom_llm_provider and self.custom_llm_provider == "aleph-alpha": #aleph alpha doesn't provide streaming
elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk)
response_obj = self.handle_aleph_alpha_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
elif self.model in (litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models):
chunk = next(self.completion_stream)
completion_obj["content"] = str(chunk)
try:
chunk = next(self.completion_stream)
response_obj = self.handle_nlp_cloud_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
except Exception as e:
if self.sent_last_chunk:
raise e
else:
if self.sent_first_chunk is False:
raise Exception("An unknown error occurred with the stream")
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai":
try:
chunk = next(self.completion_stream)
completion_obj["content"] = str(chunk)
except StopIteration as e:
if self.sent_last_chunk:
raise e
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
elif self.custom_llm_provider == "cohere":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_cohere_chunk(chunk)
response_obj = self.handle_cohere_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "bedrock":
completion_obj["content"] = self.handle_bedrock_stream()
chunk = next(self.completion_stream)
response_obj = self.handle_bedrock_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "sagemaker":
if len(self.completion_stream)==0:
raise StopIteration
if self.sent_last_chunk:
raise StopIteration
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
chunk_size = 30
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
@@ -2765,11 +2945,13 @@ class CustomStreamWrapper:
self.sent_first_chunk = True
model_response.choices[0].delta = Delta(**completion_obj)
return model_response
elif model_response.choices[0].finish_reason:
return model_response
except StopIteration:
raise StopIteration
except Exception as e:
model_response.choices[0].finish_reason = "stop"
return model_response
except Exception as e:
e.message = str(e)
return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e)
async def __anext__(self):
try:
@@ -2796,7 +2978,6 @@ def read_config_args(config_path) -> dict:
# read keys/ values from config file and return them
return config
except Exception as e:
print("An error occurred while reading config:", str(e))
raise e
########## experimental completion variants ############################
@@ -2899,7 +3080,6 @@ def get_model_split_test(models, completion_call_id):
try:
# make the api call
last_fetched_at = time.time()
print(f"last_fetched_at: {last_fetched_at}")
response = requests.post(
#http://api.litellm.ai
url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict

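With the provider branches above keyed on custom_llm_provider, a bad key on a streamed call now surfaces as a typed litellm exception rather than an unmapped provider error. A minimal sketch mirroring the bad-key tests earlier in this commit (the model and key are placeholders; which exception type fires depends on the provider):

from litellm import completion, AuthenticationError, InvalidRequestError

try:
    response = completion(
        model="command-nightly",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        api_key="bad-key",                 # deliberately invalid
    )
    for chunk in response:
        pass                               # the mapped error is raised while iterating
except AuthenticationError:
    print("cohere maps an invalid key to AuthenticationError")
except InvalidRequestError:
    print("some providers (replicate, ai21, together_ai) map bad keys to InvalidRequestError")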
View file

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.1.738"
version = "0.1.739"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"