ensure streaming format is exactly the same as openai

Author: Krrish Dholakia, 2023-09-16 10:34:20 -07:00
parent ebd4688fec
commit 21cd55ab26
6 changed files with 275 additions and 169 deletions


@@ -197,7 +197,7 @@ def completion(
         completion_call_id=id
     )
     logging.update_environment_variables(model=model, user=user, optional_params=optional_params, litellm_params=litellm_params)
-    get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
+    model, custom_llm_provider = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider)
     if custom_llm_provider == "azure":
         # azure configs
         api_type = get_secret("AZURE_API_TYPE") or "azure"


@@ -24,6 +24,170 @@ def logger_fn(model_call_object: dict):
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

first_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"content": ""
},
"finish_reason": None # it's null
}
]
}
def validate_first_format(chunk):
# write a test to make sure chunk follows the same format as first_openai_chunk_example
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "role" in choice['delta'], "'delta' should have a 'role'."
assert isinstance(choice['delta']['role'], str), "'role' should be a string."
assert "content" in choice['delta'], "'delta' should have 'content'."
assert isinstance(choice['delta']['content'], str), "'content' should be a string."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
second_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {
"content": "Hello"
},
"finish_reason": None # it's null
}
]
}
def validate_second_format(chunk):
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "content" in choice['delta'], "'delta' should have 'content'."
assert isinstance(choice['delta']['content'], str), "'content' should be a string."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."
last_openai_chunk_example = {
"id": "chatcmpl-7zSKLBVXnX9dwgRuDYVqVVDsgh2yp",
"object": "chat.completion.chunk",
"created": 1694881253,
"model": "gpt-4-0613",
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": "stop"
}
]
}
def validate_last_format(chunk):
assert isinstance(chunk, dict), "Chunk should be a dictionary."
assert "id" in chunk, "Chunk should have an 'id'."
assert isinstance(chunk['id'], str), "'id' should be a string."
assert "object" in chunk, "Chunk should have an 'object'."
assert isinstance(chunk['object'], str), "'object' should be a string."
assert "created" in chunk, "Chunk should have a 'created'."
assert isinstance(chunk['created'], int), "'created' should be an integer."
assert "model" in chunk, "Chunk should have a 'model'."
assert isinstance(chunk['model'], str), "'model' should be a string."
assert "choices" in chunk, "Chunk should have 'choices'."
assert isinstance(chunk['choices'], list), "'choices' should be a list."
for choice in chunk['choices']:
assert isinstance(choice, dict), "Each choice should be a dictionary."
assert "index" in choice, "Each choice should have 'index'."
assert isinstance(choice['index'], int), "'index' should be an integer."
assert "delta" in choice, "Each choice should have 'delta'."
assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string."
def streaming_format_tests(idx, chunk):
extracted_chunk = ""
finished = False
if idx == 0: # ensure role assistant is set
validate_first_format(chunk=chunk)
role = chunk["choices"][0]["delta"]["role"]
assert role == "assistant"
elif idx == 1: # second chunk
validate_second_format(chunk=chunk)
if idx != 0: # ensure no role
if "role" in chunk["choices"][0]["delta"]:
raise Exception("role should not exist after first chunk")
if chunk["choices"][0]["finish_reason"]: # ensure finish reason is only in last chunk
validate_last_format(chunk=chunk)
finished = True
if "content" in chunk["choices"][0]["delta"]:
extracted_chunk = chunk["choices"][0]["delta"]["content"]
return extracted_chunk, finished
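
Every test below drives this helper with the same loop; a minimal sketch of that shared pattern (the run_stream name is illustrative only, not part of this commit):

def run_stream(response):
    # Mirrors the inline loop each test below repeats.
    complete_response = ""
    for idx, chunk in enumerate(response):
        piece, finished = streaming_format_tests(idx, chunk)
        if finished:
            break
        complete_response += piece
    if complete_response.strip() == "":
        raise Exception("Empty response received")
    return complete_response
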
def test_completion_cohere_stream():
    try:
        messages = [
@@ -38,36 +202,18 @@ def test_completion_cohere_stream():
         )
         complete_response = ""
         # Add any assertions here to check the response
-        for chunk in response:
-            print(f"chunk: {chunk}")
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except KeyError as e:
-        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-# test on baseten completion call
-# try:
-#     response = completion(
-#         model="baseten/RqgAEn0", messages=messages, logger_fn=logger_fn
-#     )
-#     print(f"response: {response}")
-#     complete_response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.5f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"]
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-#     print(f"complete response: {complete_response}")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
+# test_completion_cohere_stream()
# test on openai completion call
def test_openai_text_completion_call():
@@ -77,16 +223,17 @@ def test_openai_text_completion_call():
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"chunk: {chunk}")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+
+test_openai_text_completion_call()
# # test on ai21 completion call
def ai21_completion_call():
@@ -97,18 +244,18 @@ def ai21_completion_call():
         print(f"response: {response}")
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk)
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
-        print(f"completion_response: {complete_response}")
     except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
+        pytest.fail(f"error occurred: {traceback.format_exc()}")
+# ai21_completion_call()
# test on openai completion call
def test_openai_chat_completion_call():
    try:
@@ -117,107 +264,20 @@ def test_openai_chat_completion_call():
         )
         complete_response = ""
         start_time = time.time()
-        for chunk in response:
-            print(chunk)
-            if chunk["choices"][0]["finish_reason"]:
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
                 break
-            # if chunk["choices"][0]["delta"]["role"] != "assistant":
-            #     raise Exception("invalid role")
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
+            complete_response += chunk
         # print(f'complete_chunk: {complete_response}')
         if complete_response.strip() == "":
             raise Exception("Empty response received")
-        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
-test_openai_chat_completion_call()
+# test_openai_chat_completion_call()
-
-async def completion_call():
-    try:
-        response = completion(
-            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-        )
-        print(f"response: {response}")
-        complete_response = ""
-        start_time = time.time()
-        # Change for loop to async for loop
-        async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            if "content" in chunk["choices"][0]["delta"]:
-                complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
-# asyncio.run(completion_call())
-
-# # test on azure completion call
-# try:
-#     response = completion(
-#         model="azure/chatgpt-test", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     response = ""
-#     start_time = time.time()
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         response += chunk["choices"][0]["delta"]
-#     if response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-# # test on huggingface completion call
-# try:
-#     start_time = time.time()
-#     response = completion(
-#         model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
-#     )
-#     complete_response = ""
-#     for chunk in response:
-#         chunk_time = time.time()
-#         print(f"time since initial request: {chunk_time - start_time:.2f}")
-#         print(chunk["choices"][0]["delta"])
-#         complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
-#     if complete_response == "":
-#         raise Exception("Empty response received")
-# except:
-#     print(f"error occurred: {traceback.format_exc()}")
-#     pass
-
-# test on together ai completion call - replit-code-3b
-def test_together_ai_completion_call_replit():
-    try:
-        start_time = time.time()
-        response = completion(
-            model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True
-        )
-        complete_response = ""
-        print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.2f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-        if complete_response == "":
-            raise Exception("Empty response received")
-    except KeyError as e:
-        pass
-    except:
-        print(f"error occurred: {traceback.format_exc()}")
-        pass
# # test on together ai completion call - starcoder
def test_together_ai_completion_call_starcoder():
@@ -231,23 +291,18 @@ def test_together_ai_completion_call_starcoder():
         )
         complete_response = ""
         print(f"returned response object: {response}")
-        for chunk in response:
-            chunk_time = time.time()
-            complete_response += (
-                chunk["choices"][0]["delta"]["content"]
-                if len(chunk["choices"][0]["delta"].keys()) > 0
-                else ""
-            )
-            if len(complete_response) > 0:
-                print(complete_response)
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
         if complete_response == "":
             raise Exception("Empty response received")
-    except KeyError as e:
-        pass
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
+# test_together_ai_completion_call_starcoder()
# test on aleph alpha completion call - commented out as it's expensive to run this on circle ci for every build
# def test_aleph_alpha_call():
#     try:
@@ -286,13 +341,43 @@ async def ai21_async_completion_call():
         complete_response = ""
         start_time = time.time()
         # Change for loop to async for loop
+        idx = 0
         async for chunk in response:
-            chunk_time = time.time()
-            print(f"time since initial request: {chunk_time - start_time:.5f}")
-            print(chunk["choices"][0]["delta"])
-            complete_response += chunk["choices"][0]["delta"]["content"]
-        if complete_response == "":
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
             raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
+# asyncio.run(ai21_async_completion_call())
+
+async def completion_call():
+    try:
+        response = completion(
+            model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
+        )
+        print(f"response: {response}")
+        complete_response = ""
+        start_time = time.time()
+        # Change for loop to async for loop
+        idx = 0
+        async for chunk in response:
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+            idx += 1
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"complete response: {complete_response}")
+    except:
+        print(f"error occurred: {traceback.format_exc()}")
+        pass
+# asyncio.run(completion_call())
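
A note on the manual idx counter in the two async tests above: the builtin enumerate only accepts synchronous iterables, so the index has to be tracked by hand inside async for. A small helper along these lines would restore the enumerate ergonomics (the aenumerate name is illustrative, not part of this commit):

async def aenumerate(aiterable, start=0):
    # Async counterpart of enumerate(), which cannot wrap async iterators.
    idx = start
    async for item in aiterable:
        yield idx, item
        idx += 1
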


@@ -80,6 +80,8 @@ last_fetched_at_keys = None
 # 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}
 # }

+def _generate_id(): # private helper function
+    return 'chatcmpl-' + str(uuid.uuid4())

 class Message(OpenAIObject):
     def __init__(self, content="default", role="assistant", logprobs=None, **params):
@@ -89,9 +91,9 @@ class Message(OpenAIObject):
         self.logprobs = logprobs

 class Delta(OpenAIObject):
-    def __init__(self, content="<special_litellm_token>", logprobs=None, role=None, **params):
+    def __init__(self, content=None, logprobs=None, role=None, **params):
         super(Delta, self).__init__(**params)
-        if content != "<special_litellm_token>":
+        if content is not None:
             self.content = content
         if role:
             self.role = role
@@ -105,20 +107,35 @@ class Choices(OpenAIObject):
         self.message = message

 class StreamingChoices(OpenAIObject):
-    def __init__(self, finish_reason=None, index=0, delta=Delta(), **params):
+    def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params):
         super(StreamingChoices, self).__init__(**params)
         self.finish_reason = finish_reason
         self.index = index
-        self.delta = delta
+        if delta:
+            print(f"delta passed in: {delta}")
+            self.delta = delta
+        else:
+            self.delta = Delta()
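
The switch from delta=Delta() to delta: Optional[Delta]=None also sidesteps Python's mutable-default pitfall: default arguments are evaluated once, at definition time, so under the old signature every StreamingChoices() would have shared a single Delta instance. A standalone illustration of the pitfall (names here are hypothetical):

def shared(d={}):  # the default dict is created once, when the function is defined
    d["n"] = d.get("n", 0) + 1
    return d

shared()  # {'n': 1}
shared()  # {'n': 2} -- the same dict object is reused across calls
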
 class ModelResponse(OpenAIObject):
-    def __init__(self, choices=None, created=None, model=None, usage=None, stream=False, **params):
-        super(ModelResponse, self).__init__(**params)
+    def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, **params):
         if stream:
-            self.choices = self.choices = choices if choices else [StreamingChoices()]
+            self.object = "chat.completion.chunk"
+            self.choices = [StreamingChoices()]
         else:
+            if model in litellm.open_ai_embedding_models:
+                self.object = "embedding"
+            else:
+                self.object = "chat.completion"
             self.choices = self.choices = choices if choices else [Choices()]
-        self.created = created
+        if id is None:
+            self.id = _generate_id()
+        else:
+            self.id = id
+        if created is None:
+            self.created = int(time.time())
+        else:
+            self.created = created
         self.model = model
         self.usage = (
             usage
@@ -129,6 +146,7 @@ class ModelResponse(OpenAIObject):
                 "total_tokens": None,
             }
         )
+        super(ModelResponse, self).__init__(**params)
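
Taken together, these ModelResponse changes mean a streaming response now carries OpenAI-shaped metadata without the caller supplying any of it. A rough illustration of the resulting defaults (attribute access assumed via the OpenAIObject base class):

resp = ModelResponse(stream=True, model="gpt-3.5-turbo")
resp.object   # "chat.completion.chunk"
resp.id       # "chatcmpl-<uuid4>", from _generate_id()
resp.created  # int(time.time()) captured at construction
resp.choices  # [StreamingChoices()] wrapping an empty Delta
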
     def to_dict_recursive(self):
         d = super().to_dict_recursive()
@@ -1041,8 +1059,10 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
     # check if model in known model provider list
     ## openai - chatcompletion + text completion
-    if model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models:
+    if model in litellm.open_ai_chat_completion_models:
         custom_llm_provider = "openai"
+    elif model in litellm.open_ai_text_completion_models:
+        custom_llm_provider = "text-completion-openai"
     ## anthropic
     elif model in litellm.anthropic_models:
         custom_llm_provider = "anthropic"
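
Combined with the call-site change in completion() above, which unpacks the returned tuple, chat and text-completion models now resolve to distinct providers. Roughly, assuming both model names are present in litellm's model registries:

model, provider = get_llm_provider("gpt-3.5-turbo")     # provider == "openai"
model, provider = get_llm_provider("text-davinci-003")  # provider == "text-completion-openai"
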
@@ -2359,6 +2379,7 @@ class CustomStreamWrapper:
         self.custom_llm_provider = custom_llm_provider
         self.logging_obj = logging_obj
         self.completion_stream = completion_stream
+        self.sent_first_chunk = False
         if self.logging_obj:
             # Log the type of the received item
             self.logging_obj.post_call(str(type(completion_stream)))
@@ -2413,7 +2434,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["generated_text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2430,7 +2450,6 @@
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
         try:
-            print(f"data json: {data_json}")
             return data_json["text"]
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
@@ -2485,8 +2504,12 @@
         return ""

     def __next__(self):
+        model_response = ModelResponse(stream=True, model=self.model)
         try:
             # return this for all models
+            if self.sent_first_chunk == False:
+                model_response.choices[0].delta.role = "assistant"
+                self.sent_first_chunk = True
             completion_obj = {"content": ""} # default to role being assistant
             if self.model in litellm.anthropic_models:
                 chunk = next(self.completion_stream)
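
The sent_first_chunk flag is what makes the wrapper's output match OpenAI's stream shape: the "role": "assistant" delta is emitted exactly once, on the first chunk, which is exactly what validate_first_format and the no-role check in streaming_format_tests assert. A hedged consumer-side sketch, using completion and messages as in the tests above (assumes a configured provider key):

stream = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
first = next(iter(stream))
assert first["choices"][0]["delta"]["role"] == "assistant"  # first chunk only
# Later chunks carry only content; see validate_second_format above.
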
@@ -2544,7 +2567,7 @@
             model_response.choices[0].delta = completion_obj
             model_response.model = self.model
-            if model_response.choices[0].delta['content'] == "<special_litellm_token>":
+            if model_response.choices[0].delta.content == "<special_litellm_token>":
                 model_response.choices[0].delta = {
                     "content": completion_obj["content"],
                 }
@@ -2552,8 +2575,6 @@
         except StopIteration:
             raise StopIteration
         except Exception as e:
-            print(e)
-            model_response = ModelResponse(stream=True)
             model_response.choices[0].finish_reason = "stop"
             return model_response


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.675"
+version = "0.1.676"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"