diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 48f542741..78da5f6ab 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index 7e2dcf510..37f9204e0 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/main.py b/litellm/main.py
index 765c6ff83..06d938ac8 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -197,7 +197,7 @@ def completion(
         completion_call_id=id
     )
     logging.update_environment_variables(model=model, user=user, optional_params=optional_params, litellm_params=litellm_params)
-    get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
+    model, custom_llm_provider = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
     if custom_llm_provider == "azure":
         # azure configs
         api_type = get_secret("AZURE_API_TYPE") or "azure"
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 6b1b2b9a1..eff0ddcb3 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -24,6 +24,170 @@ def logger_fn(model_call_object: dict):
 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]
 
+
+first_openai_chunk_example = {
+    "id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
+    "object": "chat.completion.chunk",
+    "created": 1694881253,
+    "model": "gpt-4-0613",
+    "choices": [
+        {
+            "index": 0,
+            "delta": {
+                "role": "assistant",
+                "content": ""
+            },
+            "finish_reason": None # it's null
+        }
+    ]
+}
+
+def validate_first_format(chunk):
+    # write a test to make sure chunk follows the same format as first_openai_chunk_example
+    assert isinstance(chunk, dict), "Chunk should be a dictionary."
+    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk['id'], str), "'id' should be a string."
+
+    assert "object" in chunk, "Chunk should have an 'object'."
+    assert isinstance(chunk['object'], str), "'object' should be a string."
+
+    assert "created" in chunk, "Chunk should have a 'created'."
+    assert isinstance(chunk['created'], int), "'created' should be an integer."
+
+    assert "model" in chunk, "Chunk should have a 'model'."
+    assert isinstance(chunk['model'], str), "'model' should be a string."
+
+    assert "choices" in chunk, "Chunk should have 'choices'."
+    assert isinstance(chunk['choices'], list), "'choices' should be a list."
+
+    for choice in chunk['choices']:
+        assert isinstance(choice, dict), "Each choice should be a dictionary."
+
+        assert "index" in choice, "Each choice should have 'index'."
+        assert isinstance(choice['index'], int), "'index' should be an integer."
+
+        assert "delta" in choice, "Each choice should have 'delta'."
+        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
+
+        assert "role" in choice['delta'], "'delta' should have a 'role'."
+        assert isinstance(choice['delta']['role'], str), "'role' should be a string."
+
+        assert "content" in choice['delta'], "'delta' should have 'content'."
+        assert isinstance(choice['delta']['content'], str), "'content' should be a string."
+
+        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
+        assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
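+
+# Chunks after the first drop the "role" key and stream only incremental "content";
+# second_openai_chunk_example and validate_second_format below capture that shape.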
+second_openai_chunk_example = {
+    "id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
+    "object": "chat.completion.chunk",
+    "created": 1694881253,
+    "model": "gpt-4-0613",
+    "choices": [
+        {
+            "index": 0,
+            "delta": {
+                "content": "Hello"
+            },
+            "finish_reason": None # it's null
+        }
+    ]
+}
+
+def validate_second_format(chunk):
+    assert isinstance(chunk, dict), "Chunk should be a dictionary."
+    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk['id'], str), "'id' should be a string."
+
+    assert "object" in chunk, "Chunk should have an 'object'."
+    assert isinstance(chunk['object'], str), "'object' should be a string."
+
+    assert "created" in chunk, "Chunk should have a 'created'."
+    assert isinstance(chunk['created'], int), "'created' should be an integer."
+
+    assert "model" in chunk, "Chunk should have a 'model'."
+    assert isinstance(chunk['model'], str), "'model' should be a string."
+
+    assert "choices" in chunk, "Chunk should have 'choices'."
+    assert isinstance(chunk['choices'], list), "'choices' should be a list."
+
+    for choice in chunk['choices']:
+        assert isinstance(choice, dict), "Each choice should be a dictionary."
+
+        assert "index" in choice, "Each choice should have 'index'."
+        assert isinstance(choice['index'], int), "'index' should be an integer."
+
+        assert "delta" in choice, "Each choice should have 'delta'."
+        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
+
+        assert "content" in choice['delta'], "'delta' should have 'content'."
+        assert isinstance(choice['delta']['content'], str), "'content' should be a string."
+
+        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
+        assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
+
+last_openai_chunk_example = {
+    "id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
+    "object": "chat.completion.chunk",
+    "created": 1694881253,
+    "model": "gpt-4-0613",
+    "choices": [
+        {
+            "index": 0,
+            "delta": {},
+            "finish_reason": "stop"
+        }
+    ]
+}
+
+def validate_last_format(chunk):
+    assert isinstance(chunk, dict), "Chunk should be a dictionary."
+    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk['id'], str), "'id' should be a string."
+
+    assert "object" in chunk, "Chunk should have an 'object'."
+    assert isinstance(chunk['object'], str), "'object' should be a string."
+
+    assert "created" in chunk, "Chunk should have a 'created'."
+    assert isinstance(chunk['created'], int), "'created' should be an integer."
+
+    assert "model" in chunk, "Chunk should have a 'model'."
+    assert isinstance(chunk['model'], str), "'model' should be a string."
+
+    assert "choices" in chunk, "Chunk should have 'choices'."
+    assert isinstance(chunk['choices'], list), "'choices' should be a list."
+
+    for choice in chunk['choices']:
+        assert isinstance(choice, dict), "Each choice should be a dictionary."
+
+        assert "index" in choice, "Each choice should have 'index'."
+        assert isinstance(choice['index'], int), "'index' should be an integer."
+
+        assert "delta" in choice, "Each choice should have 'delta'."
+        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
+
+        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
+        assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string."
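+
+# Position-based dispatcher used by every streaming test below: the first chunk
+# must set role "assistant", later chunks must not resend it, and the final chunk
+# is recognized by a non-null finish_reason. Returns (extracted content, finished).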
+def streaming_format_tests(idx, chunk):
+    extracted_chunk = ""
+    finished = False
+    if idx == 0: # ensure role assistant is set
+        validate_first_format(chunk=chunk)
+        role = chunk["choices"][0]["delta"]["role"]
+        assert role == "assistant"
+    elif idx == 1: # second chunk
+        validate_second_format(chunk=chunk)
+    if idx != 0: # ensure no role
+        if "role" in chunk["choices"][0]["delta"]:
+            raise Exception("role should not exist after first chunk")
+    if chunk["choices"][0]["finish_reason"]: # ensure finish reason is only in last chunk
+        validate_last_format(chunk=chunk)
+        finished = True
+    if "content" in chunk["choices"][0]["delta"]:
+        extracted_chunk = chunk["choices"][0]["delta"]["content"]
+    return extracted_chunk, finished
+
 def test_completion_cohere_stream():
     try:
         messages = [
@@ -38,36 +202,18 @@ def test_completion_cohere_stream():
         )
         complete_response = ""
         # Add any assertions here to check the response
-        for chunk in response:
-            print(f"chunk: {chunk}")
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except KeyError as e:
-        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-# test on baseten completion call
-# try:
-#     response = completion(
-#         model="baseten/RqgAEn0", messages=messages, logger_fn=logger_fn
-#     )
-#     print(f"response: {response}")
-#     complete_response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.5f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"]
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-#     print(f"complete response: {complete_response}")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
+# test_completion_cohere_stream()
 
 # test on openai completion call
 def test_openai_text_completion_call():
@@ -77,16 +223,17 @@
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"chunk: {chunk}")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+
+test_openai_text_completion_call()
 
 # # test on ai21 completion call
 def ai21_completion_call():
@@ -97,18 +244,18 @@
         print(f"response: {response}")
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk)
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+# ai21_completion_call()
 
 # test on openai completion call
 def test_openai_chat_completion_call():
     try:
@@ -117,107 +264,20 @@
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            print(chunk)
-            if chunk["choices"][0]["finish_reason"]:
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
                 break
-            # if chunk["choices"][0]["delta"]["role"] != "assistant":
-            #     raise Exception("invalid role")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
+            complete_response += chunk
             # print(f'complete_chunk: {complete_response}')
         if complete_response.strip() == "":
             raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
 
-test_openai_chat_completion_call()
-async def completion_call():
-    try:
-        response = completion(
-            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-        )
-        print(f"response: {response}")
-        complete_response = ""
-        start_time = time.time()
-        # Change for loop to async for loop
-        async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
-
-# asyncio.run(completion_call())
-
-# # test on azure completion call
-# try:
-#     response = completion(
-#         model="azure/chatgpt-test", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         response += chunk["choices"][0]["delta"]
-#     if response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-
-# # test on huggingface completion call
-# try:
-#     start_time = time.time()
-#     response = completion(
-#         model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     complete_response = ""
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-# test on together ai completion call - replit-code-3b
-def test_together_ai_completion_call_replit():
-    try:
-        start_time = time.time()
-        response = completion(
-            model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True
-        )
-        complete_response = ""
-        print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.2f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except KeyError as e:
-        pass
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+# test_openai_chat_completion_call()
 
 # # test on together ai completion call - starcoder
 def test_together_ai_completion_call_starcoder():
@@ -231,23 +291,18 @@
         )
         complete_response = ""
         print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-            if len(complete_response) > 0:
-                print(complete_response)
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
         if complete_response == "":
             raise Exception("Empty response received")
-    except KeyError as e:
-        pass
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
-
+
+# test_together_ai_completion_call_starcoder()
 # test on aleph alpha completion call - commented out as it's expensive to run this on circle ci for every build
 # def test_aleph_alpha_call():
 #     try:
@@ -286,13 +341,43 @@ async def ai21_async_completion_call():
         complete_response = ""
         start_time = time.time()
         # Change for loop to async for loop
+        idx = 0
         async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
-        pass
\ No newline at end of file
+        pass
+
+# asyncio.run(ai21_async_completion_call())
+
+async def completion_call():
+    try:
+        response = completion(
+            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
+        )
+        print(f"response: {response}")
+        complete_response = ""
+        start_time = time.time()
+        # Change for loop to async for loop
+        idx = 0
+        async for chunk in response:
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
+    except:
+        print(f"error occurred: {traceback.format_exc()}")
+        pass
+
+# asyncio.run(completion_call())
diff --git a/litellm/utils.py b/litellm/utils.py
index dcf41cbce..b52136035 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -80,6 +80,8 @@ last_fetched_at_keys = None
 # 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}
 # }
 
+def _generate_id(): # private helper function
+    return 'chatcmpl-' + str(uuid.uuid4())
 
 class Message(OpenAIObject):
     def __init__(self, content="default", role="assistant", logprobs=None, **params):
@@ -89,9 +91,9 @@ class Message(OpenAIObject):
         self.logprobs = logprobs
 
 class Delta(OpenAIObject):
-    def __init__(self, content="", logprobs=None, role=None, **params):
+    def __init__(self, content=None, logprobs=None, role=None, **params):
         super(Delta, self).__init__(**params)
-        if content != "":
+        if content is not None:
             self.content = content
         if role:
             self.role = role
@@ -105,20 +107,35 @@ class Choices(OpenAIObject):
         self.message = message
 
 class StreamingChoices(OpenAIObject):
-    def __init__(self, finish_reason=None, index=0, delta=Delta(), **params):
+    def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params):
         super(StreamingChoices, self).__init__(**params)
         self.finish_reason = finish_reason
         self.index = index
-        self.delta = delta
+        if delta:
+            print(f"delta passed in: {delta}")
+            self.delta = delta
+        else:
+            self.delta = Delta()
 
 class ModelResponse(OpenAIObject):
-    def __init__(self, choices=None, created=None, model=None, usage=None, stream=False, **params):
-        super(ModelResponse, self).__init__(**params)
+    def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, **params):
         if stream:
-            self.choices = self.choices = choices if choices else [StreamingChoices()]
+            self.object = "chat.completion.chunk"
+            self.choices = [StreamingChoices()]
         else:
+            if model in litellm.open_ai_embedding_models:
+                self.object = "embedding"
+            else:
+                self.object = "chat.completion"
             self.choices = self.choices = choices if choices else [Choices()]
-        self.created = created
+        if id is None:
+            self.id = _generate_id()
+        else:
+            self.id = id
+        if created is None:
+            self.created = int(time.time())
+        else:
+            self.created = created
         self.model = model
         self.usage = (
             usage
@@ -129,6 +146,7 @@ class ModelResponse(OpenAIObject):
                 "total_tokens": None,
             }
         )
+        super(ModelResponse, self).__init__(**params)
 
     def to_dict_recursive(self):
         d = super().to_dict_recursive()
@@ -1041,8 +1059,10 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
         # check if model in known model provider list
         ## openai - chatcompletion + text completion
-        if model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models:
+        if model in litellm.open_ai_chat_completion_models:
             custom_llm_provider = "openai"
+        elif model in litellm.open_ai_text_completion_models:
+            custom_llm_provider = "text-completion-openai"
         ## anthropic
         elif model in litellm.anthropic_models:
             custom_llm_provider = "anthropic"
@@ -2359,6 +2379,7 @@ class CustomStreamWrapper:
         self.custom_llm_provider = custom_llm_provider
         self.logging_obj = logging_obj
         self.completion_stream = completion_stream
+        self.sent_first_chunk = False
         if self.logging_obj:
             # Log the type of the received item
             self.logging_obj.post_call(str(type(completion_stream)))
@@ -2413,7 +2434,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["generated_text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2430,7 +2450,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2485,8 +2504,12 @@
         return ""
 
     def __next__(self):
+        model_response = ModelResponse(stream=True, model=self.model)
         try:
             # return this for all models
+            if self.sent_first_chunk == False:
+                model_response.choices[0].delta.role = "assistant"
+                self.sent_first_chunk = True
             completion_obj = {"content": ""} # default to role being assistant
             if self.model in litellm.anthropic_models:
                 chunk = next(self.completion_stream)
@@ -2544,7 +2567,7 @@
 
             model_response.choices[0].delta = completion_obj
             model_response.model = self.model
-            if model_response.choices[0].delta['content'] == "":
+            if model_response.choices[0].delta.content == "":
                 model_response.choices[0].delta = {
                     "content": completion_obj["content"],
                 }
@@ -2552,8 +2575,6 @@
 
         except StopIteration:
             raise StopIteration
         except Exception as e:
-            print(e)
-            model_response = ModelResponse(stream=True)
             model_response.choices[0].finish_reason = "stop"
             return model_response
diff --git a/pyproject.toml b/pyproject.toml
index fb0e5ae29..768235679 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.675"
+version = "0.1.676"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"