ensure streaming format is exactly the same as openai

Author: Krrish Dholakia, 2023-09-16 10:34:20 -07:00
parent ebd4688fec
commit 21cd55ab26
6 changed files with 275 additions and 169 deletions


@@ -197,7 +197,7 @@ def completion(
         completion_call_id=id
     )
     logging.update_environment_variables(model=model, user=user, optional_params=optional_params, litellm_params=litellm_params)
-    get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
+    model, custom_llm_provider = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
     if custom_llm_provider == "azure":
         # azure configs
         api_type = get_secret("AZURE_API_TYPE") or "azure"


@@ -24,6 +24,170 @@ def logger_fn(model_call_object: dict):
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

first_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"content": ""
},
"finish_reason": None # it's null
}
]
}
def validate_first_format(chunk):
# write a test to make sure chunk follows the same format as first_openai_chunk_example
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "role" in choice['delta'], "'delta' should have a 'role'."
assert isinstance(choice['delta']['role'], str), "'role' should be a string."
assert "content" in choice['delta'], "'delta' should have 'content'."
assert isinstance(choice['delta']['content'], str), "'content' should be a string."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
second_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {
"content": "Hello"
},
"finish_reason": None # it's null
}
]
}
def validate_second_format(chunk):
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "content" in choice['delta'], "'delta' should have 'content'."
assert isinstance(choice['delta']['content'], str), "'content' should be a string."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
last_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": "stop"
}
]
}
def validate_last_format(chunk):
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string."
def streaming_format_tests(idx, chunk):
extracted_chunk = ""
finished = False
if idx == 0: # ensure role assistant is set
validate_first_format(chunk=chunk)
role = chunk["choices"][0]["delta"]["role"]
assert role == "assistant"
elif idx == 1: # second chunk
validate_second_format(chunk=chunk)
if idx != 0: # ensure no role
if "role" in chunk["choices"][0]["delta"]:
raise Exception("role should not exist after first chunk")
if chunk["choices"][0]["finish_reason"]: # ensure finish reason is only in last chunk
validate_last_format(chunk=chunk)
finished = True
if "content" in chunk["choices"][0]["delta"]:
extracted_chunk = chunk["choices"][0]["delta"]["content"]
return extracted_chunk, finished
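
Every test below drives this helper with the same loop; a minimal sketch of that shared pattern (the run_stream name is illustrative only, not part of this commit):

def run_stream(response):
    # Mirrors the inline loop each test below repeats.
    complete_response = ""
    for idx, chunk in enumerate(response):
        piece, finished = streaming_format_tests(idx, chunk)
        if finished:
            break
        complete_response += piece
    if complete_response.strip() == "":
        raise Exception("Empty response received")
    return complete_response
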
def test_completion_cohere_stream():
    try:
        messages = [
@@ -38,36 +202,18 @@ def test_completion_cohere_stream():
         )
         complete_response = ""
         # Add any assertions here to check the response
-        for chunk in response:
-            print(f"chunk: {chunk}")
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except KeyError as e:
-        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-# test on baseten completion call
-# try:
-#     response = completion(
-#         model="baseten/RqgAEn0", messages=messages, logger_fn=logger_fn
-#     )
-#     print(f"response: {response}")
-#     complete_response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.5f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"]
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-#     print(f"complete response: {complete_response}")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
+# test_completion_cohere_stream()
# test on openai completion call
def test_openai_text_completion_call():
@@ -77,16 +223,17 @@ def test_openai_text_completion_call():
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"chunk: {chunk}")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+
+test_openai_text_completion_call()
# # test on ai21 completion call
def ai21_completion_call():
@@ -97,18 +244,18 @@ def ai21_completion_call():
         print(f"response: {response}")
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk)
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
-        print(f"completion_response: {complete_response}")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+# ai21_completion_call()
# test on openai completion call
def test_openai_chat_completion_call():
    try:
@@ -117,107 +264,20 @@ def test_openai_chat_completion_call():
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            print(chunk)
-            if chunk["choices"][0]["finish_reason"]:
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
                 break
-            # if chunk["choices"][0]["delta"]["role"] != "assistant":
-            #     raise Exception("invalid role")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
+            complete_response += chunk
         # print(f'complete_chunk: {complete_response}')
         if complete_response.strip() == "":
             raise Exception("Empty response received")
-        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
-test_openai_chat_completion_call()
+# test_openai_chat_completion_call()
-
-async def completion_call():
-    try:
-        response = completion(
-            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-        )
-        print(f"response: {response}")
-        complete_response = ""
-        start_time = time.time()
-        # Change for loop to async for loop
-        async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
-# asyncio.run(completion_call())
-
-# # test on azure completion call
-# try:
-#     response = completion(
-#         model="azure/chatgpt-test", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         response += chunk["choices"][0]["delta"]
-#     if response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-# # test on huggingface completion call
-# try:
-#     start_time = time.time()
-#     response = completion(
-#         model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     complete_response = ""
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-# test on together ai completion call - replit-code-3b
-def test_together_ai_completion_call_replit():
-    try:
-        start_time = time.time()
-        response = completion(
-            model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True
-        )
-        complete_response = ""
-        print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.2f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except KeyError as e:
-        pass
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
# # test on together ai completion call - starcoder
def test_together_ai_completion_call_starcoder():
@@ -231,23 +291,18 @@ def test_together_ai_completion_call_starcoder():
         )
         complete_response = ""
         print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-            if len(complete_response) > 0:
-                print(complete_response)
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
         if complete_response == "":
             raise Exception("Empty response received")
-    except KeyError as e:
-        pass
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
+# test_together_ai_completion_call_starcoder()
# test on aleph alpha completion call - commented out as it's expensive to run this on circle ci for every build
# def test_aleph_alpha_call():
#     try:
@@ -286,13 +341,43 @@ async def ai21_async_completion_call():
         complete_response = ""
         start_time = time.time()
         # Change for loop to async for loop
+        idx = 0
         async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
+# asyncio.run(ai21_async_completion_call())
+
+async def completion_call():
+    try:
+        response = completion(
+            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
+        )
+        print(f"response: {response}")
+        complete_response = ""
+        start_time = time.time()
+        # Change for loop to async for loop
+        idx = 0
+        async for chunk in response:
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
+    except:
+        print(f"error occurred: {traceback.format_exc()}")
+        pass
+# asyncio.run(completion_call())
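
A note on the manual idx counter in the two async tests above: the builtin enumerate only accepts synchronous iterables, so the index has to be tracked by hand inside async for. A small helper along these lines would restore the enumerate ergonomics (the aenumerate name is illustrative, not part of this commit):

async def aenumerate(aiterable, start=0):
    # Async counterpart of enumerate(), which cannot wrap async iterators.
    idx = start
    async for item in aiterable:
        yield idx, item
        idx += 1
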


@@ -80,6 +80,8 @@ last_fetched_at_keys = None
 # 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}
 # }

+def _generate_id(): # private helper function
+    return 'chatcmpl-' + str(uuid.uuid4())

 class Message(OpenAIObject):
     def __init__(self, content="default", role="assistant", logprobs=None, **params):
@@ -89,9 +91,9 @@ class Message(OpenAIObject):
         self.logprobs = logprobs

 class Delta(OpenAIObject):
-    def __init__(self, content="<special_litellm_token>", logprobs=None, role=None, **params):
+    def __init__(self, content=None, logprobs=None, role=None, **params):
         super(Delta, self).__init__(**params)
-        if content != "<special_litellm_token>":
+        if content is not None:
             self.content = content
         if role:
             self.role = role
@@ -105,20 +107,35 @@ class Choices(OpenAIObject):
         self.message = message

 class StreamingChoices(OpenAIObject):
-    def __init__(self, finish_reason=None, index=0, delta=Delta(), **params):
+    def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params):
         super(StreamingChoices, self).__init__(**params)
         self.finish_reason = finish_reason
         self.index = index
-        self.delta = delta
+        if delta:
+            print(f"delta passed in: {delta}")
+            self.delta = delta
+        else:
+            self.delta = Delta()
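
The switch from delta=Delta() to delta: Optional[Delta]=None also sidesteps Python's mutable-default pitfall: default arguments are evaluated once, at definition time, so under the old signature every StreamingChoices() would have shared a single Delta instance. A standalone illustration of the pitfall (names here are hypothetical):

def shared(d={}):  # the default dict is created once, when the function is defined
    d["n"] = d.get("n", 0) + 1
    return d

shared()  # {'n': 1}
shared()  # {'n': 2} -- the same dict object is reused across calls
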
 class ModelResponse(OpenAIObject):
-    def __init__(self, choices=None, created=None, model=None, usage=None, stream=False, **params):
-        super(ModelResponse, self).__init__(**params)
+    def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, **params):
         if stream:
-            self.choices = self.choices = choices if choices else [StreamingChoices()]
+            self.object = "chat.completion.chunk"
+            self.choices = [StreamingChoices()]
         else:
+            if model in litellm.open_ai_embedding_models:
+                self.object = "embedding"
+            else:
+                self.object = "chat.completion"
             self.choices = self.choices = choices if choices else [Choices()]
-        self.created = created
+        if id is None:
+            self.id = _generate_id()
+        else:
+            self.id = id
+        if created is None:
+            self.created = int(time.time())
+        else:
+            self.created = created
         self.model = model
         self.usage = (
             usage
@@ -129,6 +146,7 @@ class ModelResponse(OpenAIObject):
                 "total_tokens": None,
             }
         )
+        super(ModelResponse, self).__init__(**params)
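
Taken together, these ModelResponse changes mean a streaming response now carries OpenAI-shaped metadata without the caller supplying any of it. A rough illustration of the resulting defaults (attribute access assumed via the OpenAIObject base class):

resp = ModelResponse(stream=True, model="gpt-3.5-turbo")
resp.object   # "chat.completion.chunk"
resp.id       # "chatcmpl-<uuid4>", from _generate_id()
resp.created  # int(time.time()) captured at construction
resp.choices  # [StreamingChoices()] wrapping an empty Delta
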
     def to_dict_recursive(self):
         d = super().to_dict_recursive()
@@ -1041,8 +1059,10 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
     # check if model in known model provider list
     ## openai - chatcompletion + text completion
-    if model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models:
+    if model in litellm.open_ai_chat_completion_models:
         custom_llm_provider = "openai"
+    elif model in litellm.open_ai_text_completion_models:
+        custom_llm_provider = "text-completion-openai"
     ## anthropic
     elif model in litellm.anthropic_models:
         custom_llm_provider = "anthropic"
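
Combined with the call-site change in completion() above, which unpacks the returned tuple, chat and text-completion models now resolve to distinct providers. Roughly, assuming both model names are present in litellm's model registries:

model, provider = get_llm_provider("gpt-3.5-turbo")     # provider == "openai"
model, provider = get_llm_provider("text-davinci-003")  # provider == "text-completion-openai"
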
@@ -2359,6 +2379,7 @@ class CustomStreamWrapper:
         self.custom_llm_provider = custom_llm_provider
         self.logging_obj = logging_obj
         self.completion_stream = completion_stream
+        self.sent_first_chunk = False
         if self.logging_obj:
             # Log the type of the received item
             self.logging_obj.post_call(str(type(completion_stream)))
@@ -2413,7 +2434,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["generated_text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2430,7 +2450,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2485,8 +2504,12 @@
         return ""

     def __next__(self):
+        model_response = ModelResponse(stream=True, model=self.model)
         try:
             # return this for all models
+            if self.sent_first_chunk == False:
+                model_response.choices[0].delta.role = "assistant"
+                self.sent_first_chunk = True
             completion_obj = {"content": ""} # default to role being assistant
             if self.model in litellm.anthropic_models:
                 chunk = next(self.completion_stream)
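
The sent_first_chunk flag is what makes the wrapper's output match OpenAI's stream shape: the "role": "assistant" delta is emitted exactly once, on the first chunk, which is exactly what validate_first_format and the no-role check in streaming_format_tests assert. A hedged consumer-side sketch, using completion and messages as in the tests above (assumes a configured provider key):

stream = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
first = next(iter(stream))
assert first["choices"][0]["delta"]["role"] == "assistant"  # first chunk only
# Later chunks carry only content; see validate_second_format above.
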
@@ -2544,7 +2567,7 @@
             model_response.choices[0].delta = completion_obj
             model_response.model = self.model
-            if model_response.choices[0].delta['content'] == "<special_litellm_token>":
+            if model_response.choices[0].delta.content == "<special_litellm_token>":
                 model_response.choices[0].delta = {
                     "content": completion_obj["content"],
                 }
@@ -2552,8 +2575,6 @@
         except StopIteration:
             raise StopIteration
         except Exception as e:
-            print(e)
-            model_response = ModelResponse(stream=True)
             model_response.choices[0].finish_reason = "stop"
             return model_response


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.675"
+version = "0.1.676"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"