(refactor) use helper function _assemble_complete_response_from_streaming_chunks to assemble complete responses in caching and logging callbacks (#6220)

* (refactor) use _assemble_complete_response_from_streaming_chunks
* add unit test for test_assemble_complete_response_from_streaming_chunks_1
* fix assemble complete_streaming_response
* config add logging_testing
* add logging_coverage in codecov
* test test_assemble_complete_response_from_streaming_chunks_3
* add unit tests for _assemble_complete_response_from_streaming_chunks
* fix remove unused / junk function
* add test for streaming_chunks when error assembling
parent e9a46b992c · commit a69c670baa
9 changed files with 571 additions and 90 deletions
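The commit replaces the per-callback "append chunks, then run litellm.stream_chunk_builder on the final chunk" blocks with a single shared helper. As a condensed usage sketch, based on the helper's signature and the unit tests added further down in this diff (the make_chunk factory and the literal values are illustrative only, not part of the commit):

from datetime import datetime

import litellm
from litellm.litellm_core_utils.logging_utils import (
    _assemble_complete_response_from_streaming_chunks,
)

# Per-request accumulator; the helper appends every chunk it is handed to this list.
streaming_chunks = []


def make_chunk(content, finish_reason=None):
    # Minimal streaming chunk in the same shape the new unit tests construct.
    return litellm.ModelResponse(
        id="chatcmpl-test",
        choices=[
            litellm.utils.StreamingChoices(
                finish_reason=finish_reason,
                delta=litellm.utils.Delta(content=content),
                index=0,
            )
        ],
        created=1721353246,
        model="gpt-3.5-turbo",
        object="chat.completion.chunk",
        stream=True,
    )


complete = None
for chunk in [make_chunk("hello "), make_chunk("world", finish_reason="stop")]:
    complete = _assemble_complete_response_from_streaming_chunks(
        result=chunk,
        start_time=datetime.now(),
        end_time=datetime.now(),
        request_kwargs={"messages": [{"role": "user", "content": "hi"}]},
        streaming_chunks=streaming_chunks,  # mutated in place
        is_async=False,  # only changes the wording of the error log
    )

# `complete` stays None until the chunk carrying finish_reason arrives; the helper
# then runs litellm.stream_chunk_builder over the accumulated chunks.
print(complete)

The hunks below cover the new CI job, the refactored caching and logging call sites, the helper itself, and the new tests.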
@@ -328,6 +328,48 @@ jobs:
           paths:
             - llm_translation_coverage.xml
             - llm_translation_coverage
+  logging_testing:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-cov==5.0.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/logging_callback_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml logging_coverage.xml
+            mv .coverage logging_coverage
+
+      # Store test results
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - logging_coverage.xml
+            - logging_coverage
   installing_litellm_on_python:
     docker:
       - image: circleci/python:3.8
@@ -769,7 +811,7 @@ jobs:
             python -m venv venv
             . venv/bin/activate
             pip install coverage
-            coverage combine llm_translation_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage ui_endpoint_testing_coverage
+            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage ui_endpoint_testing_coverage
             coverage xml
       - codecov/upload:
           file: ./coverage.xml
@@ -1005,9 +1047,16 @@ workflows:
               only:
                 - main
                 - /litellm_.*/
+      - logging_testing:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
       - upload-coverage:
           requires:
             - llm_translation_testing
+            - logging_testing
            - litellm_router_testing
            - local_testing
            - litellm_assistants_api_testing
@@ -1036,6 +1085,7 @@ workflows:
       - build_and_test
       - load_testing
       - llm_translation_testing
+      - logging_testing
       - litellm_router_testing
       - litellm_assistants_api_testing
       - ui_endpoint_testing
@@ -26,6 +26,9 @@ from litellm.caching.caching import (
     RedisSemanticCache,
     S3Cache,
 )
+from litellm.litellm_core_utils.logging_utils import (
+    _assemble_complete_response_from_streaming_chunks,
+)
 from litellm.types.rerank import RerankResponse
 from litellm.types.utils import (
     CallTypes,
@@ -517,28 +520,14 @@ class LLMCachingHandler:
         """
         complete_streaming_response: Optional[
             Union[ModelResponse, TextCompletionResponse]
-        ] = None
-        if (
-            processed_chunk.choices[0].finish_reason is not None
-        ):  # if it's the last chunk
-            self.async_streaming_chunks.append(processed_chunk)
-            try:
-                end_time: datetime.datetime = datetime.datetime.now()
-                complete_streaming_response = litellm.stream_chunk_builder(
-                    self.async_streaming_chunks,
-                    messages=self.request_kwargs.get("messages", None),
-                    start_time=self.start_time,
-                    end_time=end_time,
-                )
-            except Exception as e:
-                verbose_logger.exception(
-                    "Error occurred building stream chunk in success logging: {}".format(
-                        str(e)
-                    )
-                )
-                complete_streaming_response = None
-        else:
-            self.async_streaming_chunks.append(processed_chunk)
+        ] = _assemble_complete_response_from_streaming_chunks(
+            result=processed_chunk,
+            start_time=self.start_time,
+            end_time=datetime.datetime.now(),
+            request_kwargs=self.request_kwargs,
+            streaming_chunks=self.async_streaming_chunks,
+            is_async=True,
+        )

         # if a complete_streaming_response is assembled, add it to the cache
         if complete_streaming_response is not None:
@@ -12,9 +12,6 @@ import litellm
 from litellm._logging import verbose_logger
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
-from litellm.litellm_core_utils.logging_utils import (
-    convert_litellm_response_object_to_dict,
-)
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
 from litellm.proxy._types import CommonProxyErrors, SpendLogsMetadata, SpendLogsPayload
 from litellm.types.utils import (
@@ -10,9 +10,6 @@ from pydantic import BaseModel, Field
 import litellm
 from litellm._logging import verbose_logger
 from litellm.integrations.custom_logger import CustomLogger
-from litellm.litellm_core_utils.logging_utils import (
-    convert_litellm_response_object_to_dict,
-)
 from litellm.llms.custom_httpx.http_handler import (
     get_async_httpx_client,
     httpxSpecialProvider,
@@ -86,6 +86,7 @@ from ..integrations.supabase import Supabase
 from ..integrations.traceloop import TraceloopLogger
 from ..integrations.weights_biases import WeightsBiasesLogger
 from .exception_mapping_utils import _get_response_headers
+from .logging_utils import _assemble_complete_response_from_streaming_chunks

 try:
     from ..proxy.enterprise.enterprise_callbacks.generic_api_callback import (
@@ -878,32 +879,24 @@ class Logging:
         # print(f"original response in success handler: {self.model_call_details['original_response']}")
         try:
             verbose_logger.debug(f"success callbacks: {litellm.success_callback}")

             ## BUILD COMPLETE STREAMED RESPONSE
-            complete_streaming_response = None
+            complete_streaming_response: Optional[
+                Union[ModelResponse, TextCompletionResponse]
+            ] = None
             if "complete_streaming_response" in self.model_call_details:
                 return  # break out of this.
-            if self.stream and isinstance(result, ModelResponse):
-                if (
-                    result.choices[0].finish_reason is not None
-                ):  # if it's the last chunk
-                    self.sync_streaming_chunks.append(result)
-                    # print_verbose(f"final set of received chunks: {self.sync_streaming_chunks}")
-                    try:
-                        complete_streaming_response = litellm.stream_chunk_builder(
-                            self.sync_streaming_chunks,
-                            messages=self.model_call_details.get("messages", None),
-                            start_time=start_time,
-                            end_time=end_time,
-                        )
-                    except Exception as e:
-                        verbose_logger.exception(
-                            "LiteLLM.LoggingError: [Non-Blocking] Exception occurred while building complete streaming response in success logging {}".format(
-                                str(e)
-                            )
-                        )
-                        complete_streaming_response = None
-                else:
-                    self.sync_streaming_chunks.append(result)
+            if self.stream:
+                complete_streaming_response: Optional[
+                    Union[ModelResponse, TextCompletionResponse]
+                ] = _assemble_complete_response_from_streaming_chunks(
+                    result=result,
+                    start_time=start_time,
+                    end_time=end_time,
+                    request_kwargs=self.model_call_details,
+                    streaming_chunks=self.sync_streaming_chunks,
+                    is_async=False,
+                )
             _caching_complete_streaming_response: Optional[
                 Union[ModelResponse, TextCompletionResponse]
             ] = None
@@ -1495,29 +1488,23 @@ class Logging:
             start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
         )
         ## BUILD COMPLETE STREAMED RESPONSE
-        complete_streaming_response = None
        if "async_complete_streaming_response" in self.model_call_details:
            return  # break out of this.
-        if self.stream:
-            if result.choices[0].finish_reason is not None:  # if it's the last chunk
-                self.streaming_chunks.append(result)
-                # verbose_logger.debug(f"final set of received chunks: {self.streaming_chunks}")
-                try:
-                    complete_streaming_response = litellm.stream_chunk_builder(
-                        self.streaming_chunks,
-                        messages=self.model_call_details.get("messages", None),
-                        start_time=start_time,
-                        end_time=end_time,
-                    )
-                except Exception as e:
-                    verbose_logger.exception(
-                        "Error occurred building stream chunk in success logging: {}".format(
-                            str(e)
-                        )
-                    )
-                    complete_streaming_response = None
-            else:
-                self.streaming_chunks.append(result)
+        complete_streaming_response: Optional[
+            Union[ModelResponse, TextCompletionResponse]
+        ] = None
+        if self.stream is True:
+            complete_streaming_response: Optional[
+                Union[ModelResponse, TextCompletionResponse]
+            ] = _assemble_complete_response_from_streaming_chunks(
+                result=result,
+                start_time=start_time,
+                end_time=end_time,
+                request_kwargs=self.model_call_details,
+                streaming_chunks=self.streaming_chunks,
+                is_async=True,
+            )
        if complete_streaming_response is not None:
            print_verbose("Async success callbacks: Got a complete streaming response")
@@ -1,4 +1,8 @@
-from typing import TYPE_CHECKING, Any, Optional, Union
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, List, Optional, Union
+
+from litellm._logging import verbose_logger
+from litellm.types.utils import ModelResponse, TextCompletionResponse

 if TYPE_CHECKING:
     from litellm import ModelResponse as _ModelResponse
@@ -15,21 +19,6 @@ Helper utils used for logging callbacks
 """


-def convert_litellm_response_object_to_dict(response_obj: Any) -> dict:
-    """
-    Convert a LiteLLM response object to a dictionary
-
-    """
-    if isinstance(response_obj, dict):
-        return response_obj
-    for _type in litellm.ALL_LITELLM_RESPONSE_TYPES:
-        if isinstance(response_obj, _type):
-            return response_obj.model_dump()
-
-    # If it's not a LiteLLM type, return the object as is
-    return dict(response_obj)
-
-
 def convert_litellm_response_object_to_str(
     response_obj: Union[Any, LiteLLMModelResponse]
 ) -> Optional[str]:
@@ -46,3 +35,55 @@ def convert_litellm_response_object_to_str(
         return response_str

     return None
+
+
+def _assemble_complete_response_from_streaming_chunks(
+    result: Union[ModelResponse, TextCompletionResponse],
+    start_time: datetime,
+    end_time: datetime,
+    request_kwargs: dict,
+    streaming_chunks: List[Any],
+    is_async: bool,
+):
+    """
+    Assemble a complete response from a streaming chunks
+
+    - assemble a complete streaming response if result.choices[0].finish_reason is not None
+    - else append the chunk to the streaming_chunks
+
+
+    Args:
+        result: ModelResponse
+        start_time: datetime
+        end_time: datetime
+        request_kwargs: dict
+        streaming_chunks: List[Any]
+        is_async: bool
+
+    Returns:
+        Optional[Union[ModelResponse, TextCompletionResponse]]: Complete streaming response
+
+    """
+    complete_streaming_response: Optional[
+        Union[ModelResponse, TextCompletionResponse]
+    ] = None
+    if result.choices[0].finish_reason is not None:  # if it's the last chunk
+        streaming_chunks.append(result)
+        try:
+            complete_streaming_response = litellm.stream_chunk_builder(
+                chunks=streaming_chunks,
+                messages=request_kwargs.get("messages", None),
+                start_time=start_time,
+                end_time=end_time,
+            )
+        except Exception as e:
+            log_message = (
+                "Error occurred building stream chunk in {} success logging: {}".format(
+                    "async" if is_async else "sync", str(e)
+                )
+            )
+            verbose_logger.exception(log_message)
+            complete_streaming_response = None
+    else:
+        streaming_chunks.append(result)
+    return complete_streaming_response
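One behavior worth noting from the helper above: if litellm.stream_chunk_builder raises while assembling the final chunk, the helper logs the error (tagged "async" or "sync" via is_async) and returns None instead of raising, and the chunk is still appended to the accumulator. A minimal sketch mirroring Test 4 further below (the deliberately malformed chunk and literal values are illustrative only):

from datetime import datetime

import litellm
from litellm.litellm_core_utils.logging_utils import (
    _assemble_complete_response_from_streaming_chunks,
)

streaming_chunks = []
final_chunk = litellm.ModelResponse(
    choices=[
        litellm.utils.StreamingChoices(
            finish_reason="stop",
            delta=litellm.utils.Delta(content="end of response"),
            index=0,
        )
    ],
    model="gpt-3.5-turbo",
    object="chat.completion.chunk",
    stream=True,
)
del final_chunk.id  # deliberately malform the final chunk so assembly fails

complete = _assemble_complete_response_from_streaming_chunks(
    result=final_chunk,
    start_time=datetime.now(),
    end_time=datetime.now(),
    request_kwargs={"messages": [{"role": "user", "content": "hi"}]},
    streaming_chunks=streaming_chunks,
    is_async=True,  # only affects the wording of the logged error
)

print(complete)               # None: the error was swallowed and logged
print(len(streaming_chunks))  # 1: the chunk was still recorded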
@@ -280,6 +280,9 @@ class CompletionCustomHandler(

     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
         try:
+            print(
+                "in async_log_success_event", kwargs, response_obj, start_time, end_time
+            )
             self.states.append("async_success")
             ## START TIME
             assert isinstance(start_time, datetime)
@@ -522,6 +525,7 @@ async def test_async_chat_azure_stream():
 @pytest.mark.asyncio
 async def test_async_chat_openai_stream_options():
     try:
+        litellm.set_verbose = True
         customHandler = CompletionCustomHandler()
         litellm.callbacks = [customHandler]
         with patch.object(
@@ -536,7 +540,7 @@ async def test_async_chat_openai_stream_options():

             async for chunk in response:
                 continue
-
+            print("mock client args list=", mock_client.await_args_list)
             mock_client.assert_awaited_once()
     except Exception as e:
         pytest.fail(f"An exception occurred: {str(e)}")
tests/logging_callback_tests/conftest.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+# conftest.py
+
+import importlib
+import os
+import sys
+
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_and_teardown():
+    """
+    This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained.
+    """
+    curr_dir = os.getcwd()  # Get the current working directory
+    sys.path.insert(
+        0, os.path.abspath("../..")
+    )  # Adds the project directory to the system path
+
+    import litellm
+    from litellm import Router
+
+    importlib.reload(litellm)
+    import asyncio
+
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    asyncio.set_event_loop(loop)
+    print(litellm)
+    # from litellm import Router, completion, aembedding, acompletion, embedding
+    yield
+
+    # Teardown code (executes after the yield point)
+    loop.close()  # Close the loop created earlier
+    asyncio.set_event_loop(None)  # Remove the reference to the loop
+
+
+def pytest_collection_modifyitems(config, items):
+    # Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests
+    custom_logger_tests = [
+        item for item in items if "custom_logger" in item.parent.name
+    ]
+    other_tests = [item for item in items if "custom_logger" not in item.parent.name]
+
+    # Sort tests based on their names
+    custom_logger_tests.sort(key=lambda x: x.name)
+    other_tests.sort(key=lambda x: x.name)
+
+    # Reorder the items list
+    items[:] = custom_logger_tests + other_tests
@@ -0,0 +1,362 @@
+"""
+Testing for _assemble_complete_response_from_streaming_chunks
+
+- Test 1 - ModelResponse with 1 list of streaming chunks. Assert chunks are added to the streaming_chunks, after final chunk sent assert complete_streaming_response is not None
+- Test 2 - TextCompletionResponse with 1 list of streaming chunks. Assert chunks are added to the streaming_chunks, after final chunk sent assert complete_streaming_response is not None
+- Test 3 - Have multiple lists of streaming chunks, Assert that chunks are added to the correct list and that complete_streaming_response is None. After final chunk sent assert complete_streaming_response is not None
+- Test 4 - build a complete response when 1 chunk is poorly formatted
+
+"""
+
+import json
+import os
+import sys
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+from pydantic.main import Model
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse, TextCompletionResponse, TextChoices
+
+from litellm.litellm_core_utils.litellm_logging import (
+    _assemble_complete_response_from_streaming_chunks,
+)
+
+
+@pytest.mark.parametrize("is_async", [True, False])
+def test_assemble_complete_response_from_streaming_chunks_1(is_async):
+    """
+    Test 1 - ModelResponse with 1 list of streaming chunks. Assert chunks are added to the streaming_chunks, after final chunk sent assert complete_streaming_response is not None
+    """
+
+    request_kwargs = {
+        "model": "test_model",
+        "messages": [{"role": "user", "content": "Hello, world!"}],
+    }
+
+    list_streaming_chunks = []
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                delta=litellm.utils.Delta(
+                    content="hello in response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks,
+        is_async=is_async,
+    )
+
+    # this is the 1st chunk - complete_streaming_response should be None
+
+    print("list_streaming_chunks", list_streaming_chunks)
+    print("complete_streaming_response", complete_streaming_response)
+    assert complete_streaming_response is None
+    assert len(list_streaming_chunks) == 1
+    assert list_streaming_chunks[0] == chunk
+
+    # Add final chunk
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                finish_reason="stop",
+                delta=litellm.utils.Delta(
+                    content="end of response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks,
+        is_async=is_async,
+    )
+
+    print("list_streaming_chunks", list_streaming_chunks)
+    print("complete_streaming_response", complete_streaming_response)
+
+    # this is the 2nd chunk - complete_streaming_response should not be None
+    assert complete_streaming_response is not None
+    assert len(list_streaming_chunks) == 2
+
+    assert isinstance(complete_streaming_response, ModelResponse)
+    assert isinstance(complete_streaming_response.choices[0], Choices)
+
+    pass
+
+
+@pytest.mark.parametrize("is_async", [True, False])
+def test_assemble_complete_response_from_streaming_chunks_2(is_async):
+    """
+    Test 2 - TextCompletionResponse with 1 list of streaming chunks. Assert chunks are added to the streaming_chunks, after final chunk sent assert complete_streaming_response is not None
+    """
+
+    from litellm.utils import TextCompletionStreamWrapper
+
+    _text_completion_stream_wrapper = TextCompletionStreamWrapper(
+        completion_stream=None, model="test_model"
+    )
+
+    request_kwargs = {
+        "model": "test_model",
+        "messages": [{"role": "user", "content": "Hello, world!"}],
+    }
+
+    list_streaming_chunks = []
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                delta=litellm.utils.Delta(
+                    content="hello in response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+    chunk = _text_completion_stream_wrapper.convert_to_text_completion_object(chunk)
+
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks,
+        is_async=is_async,
+    )
+
+    # this is the 1st chunk - complete_streaming_response should be None
+
+    print("list_streaming_chunks", list_streaming_chunks)
+    print("complete_streaming_response", complete_streaming_response)
+    assert complete_streaming_response is None
+    assert len(list_streaming_chunks) == 1
+    assert list_streaming_chunks[0] == chunk
+
+    # Add final chunk
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                finish_reason="stop",
+                delta=litellm.utils.Delta(
+                    content="end of response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+    chunk = _text_completion_stream_wrapper.convert_to_text_completion_object(chunk)
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks,
+        is_async=is_async,
+    )
+
+    print("list_streaming_chunks", list_streaming_chunks)
+    print("complete_streaming_response", complete_streaming_response)
+
+    # this is the 2nd chunk - complete_streaming_response should not be None
+    assert complete_streaming_response is not None
+    assert len(list_streaming_chunks) == 2
+
+    assert isinstance(complete_streaming_response, TextCompletionResponse)
+    assert isinstance(complete_streaming_response.choices[0], TextChoices)
+
+    pass
+
+
+@pytest.mark.parametrize("is_async", [True, False])
+def test_assemble_complete_response_from_streaming_chunks_3(is_async):
+
+    request_kwargs = {
+        "model": "test_model",
+        "messages": [{"role": "user", "content": "Hello, world!"}],
+    }
+
+    list_streaming_chunks_1 = []
+    list_streaming_chunks_2 = []
+
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                delta=litellm.utils.Delta(
+                    content="hello in response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks_1,
+        is_async=is_async,
+    )
+
+    # this is the 1st chunk - complete_streaming_response should be None
+
+    print("list_streaming_chunks_1", list_streaming_chunks_1)
+    print("complete_streaming_response", complete_streaming_response)
+    assert complete_streaming_response is None
+    assert len(list_streaming_chunks_1) == 1
+    assert list_streaming_chunks_1[0] == chunk
+    assert len(list_streaming_chunks_2) == 0
+
+    # now add a chunk to the 2nd list
+
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks_2,
+        is_async=is_async,
+    )
+
+    print("list_streaming_chunks_2", list_streaming_chunks_2)
+    print("complete_streaming_response", complete_streaming_response)
+    assert complete_streaming_response is None
+    assert len(list_streaming_chunks_2) == 1
+    assert list_streaming_chunks_2[0] == chunk
+    assert len(list_streaming_chunks_1) == 1
+
+    # now add a chunk to the 1st list
+
+
+@pytest.mark.parametrize("is_async", [True, False])
+def test_assemble_complete_response_from_streaming_chunks_4(is_async):
+    """
+    Test 4 - build a complete response when 1 chunk is poorly formatted
+
+    - Assert complete_streaming_response is None
+    - Assert list_streaming_chunks is not empty
+    """
+
+    request_kwargs = {
+        "model": "test_model",
+        "messages": [{"role": "user", "content": "Hello, world!"}],
+    }
+
+    list_streaming_chunks = []
+
+    chunk = {
+        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
+        "choices": [
+            litellm.utils.StreamingChoices(
+                finish_reason="stop",
+                delta=litellm.utils.Delta(
+                    content="end of response",
+                    function_call=None,
+                    role=None,
+                    tool_calls=None,
+                ),
+                index=0,
+                logprobs=None,
+            )
+        ],
+        "created": 1721353246,
+        "model": "gpt-3.5-turbo",
+        "object": "chat.completion.chunk",
+        "system_fingerprint": None,
+        "usage": None,
+    }
+    chunk = litellm.ModelResponse(**chunk, stream=True)
+
+    # remove attribute id from chunk
+    del chunk.id
+
+    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
+        result=chunk,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+        request_kwargs=request_kwargs,
+        streaming_chunks=list_streaming_chunks,
+        is_async=is_async,
+    )
+
+    print("complete_streaming_response", complete_streaming_response)
+    assert complete_streaming_response is None
+
+    print("list_streaming_chunks", list_streaming_chunks)
+
+    assert len(list_streaming_chunks) == 1