(refactor) use helper function _assemble_complete_response_from_streaming_chunks to assemble complete responses in caching and logging callbacks (#6220)

* (refactor) use _assemble_complete_response_from_streaming_chunks

* add unit test for test_assemble_complete_response_from_streaming_chunks_1

* fix assemble complete_streaming_response

* config add logging_testing

* add logging_coverage in codecov

* test test_assemble_complete_response_from_streaming_chunks_3

* add unit tests for _assemble_complete_response_from_streaming_chunks

* fix: remove unused/junk function

* add test for streaming_chunks when error assembling
Ishaan Jaff 2024-10-15 12:45:12 +05:30 committed by GitHub
parent e9a46b992c
commit a69c670baa
9 changed files with 571 additions and 90 deletions
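
For context, a minimal sketch of how a caching or logging callback might delegate stream assembly to this helper. The `StreamingResponseLogger` class and its `log_stream_event` method are illustrative only; the helper's import path, keyword arguments, and return behavior are taken from the unit tests added below.

from litellm.litellm_core_utils.litellm_logging import (
    _assemble_complete_response_from_streaming_chunks,
)


class StreamingResponseLogger:
    """Illustrative callback, not litellm's actual callback class."""

    def __init__(self):
        # Per-request buffer of raw chunks; the helper appends to it in place.
        self.streaming_chunks = []

    def log_stream_event(self, kwargs, chunk, start_time, end_time):
        # The helper returns None until the final chunk (finish_reason set)
        # arrives, then returns the assembled ModelResponse /
        # TextCompletionResponse for caching or logging.
        complete_response = _assemble_complete_response_from_streaming_chunks(
            result=chunk,
            start_time=start_time,
            end_time=end_time,
            request_kwargs=kwargs,
            streaming_chunks=self.streaming_chunks,
            is_async=False,  # async callback paths pass is_async=True
        )
        if complete_response is not None:
            print("complete streaming response:", complete_response)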

@@ -280,6 +280,9 @@ class CompletionCustomHandler(
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
print(
"in async_log_success_event", kwargs, response_obj, start_time, end_time
)
self.states.append("async_success")
## START TIME
assert isinstance(start_time, datetime)
@@ -522,6 +525,7 @@ async def test_async_chat_azure_stream():
@pytest.mark.asyncio
async def test_async_chat_openai_stream_options():
try:
litellm.set_verbose = True
customHandler = CompletionCustomHandler()
litellm.callbacks = [customHandler]
with patch.object(
@@ -536,7 +540,7 @@ async def test_async_chat_openai_stream_options():
async for chunk in response:
continue
print("mock client args list=", mock_client.await_args_list)
mock_client.assert_awaited_once()
except Exception as e:
pytest.fail(f"An exception occurred: {str(e)}")

@@ -0,0 +1,54 @@
# conftest.py
import importlib
import os
import sys
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
@pytest.fixture(scope="function", autouse=True)
def setup_and_teardown():
"""
This fixture reloads litellm before every test function, to speed up testing by preventing callbacks from being chained across tests.
"""
curr_dir = os.getcwd() # Get the current working directory
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the project directory to the system path
import litellm
from litellm import Router
importlib.reload(litellm)
import asyncio
loop = asyncio.get_event_loop_policy().new_event_loop()
asyncio.set_event_loop(loop)
print(litellm)
# from litellm import Router, completion, aembedding, acompletion, embedding
yield
# Teardown code (executes after the yield point)
loop.close() # Close the loop created earlier
asyncio.set_event_loop(None) # Remove the reference to the loop
def pytest_collection_modifyitems(config, items):
# Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests
custom_logger_tests = [
item for item in items if "custom_logger" in item.parent.name
]
other_tests = [item for item in items if "custom_logger" not in item.parent.name]
# Sort tests based on their names
custom_logger_tests.sort(key=lambda x: x.name)
other_tests.sort(key=lambda x: x.name)
# Reorder the items list
items[:] = custom_logger_tests + other_tests

@@ -0,0 +1,362 @@
"""
Testing for _assemble_complete_response_from_streaming_chunks
- Test 1 - ModelResponse with one list of streaming chunks. Assert chunks are appended to streaming_chunks; after the final chunk is sent, assert complete_streaming_response is not None
- Test 2 - TextCompletionResponse with one list of streaming chunks. Assert chunks are appended to streaming_chunks; after the final chunk is sent, assert complete_streaming_response is not None
- Test 3 - Multiple lists of streaming chunks. Assert chunks are added to the correct list and complete_streaming_response is None; after the final chunk is sent, assert complete_streaming_response is not None
- Test 4 - Attempt to build a complete response when one chunk is poorly formatted; assembly fails gracefully (complete_streaming_response stays None) but the chunk is still stored
"""
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
from pydantic.main import Model
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import httpx
import pytest
from respx import MockRouter
import litellm
from litellm import Choices, Message, ModelResponse, TextCompletionResponse, TextChoices
from litellm.litellm_core_utils.litellm_logging import (
_assemble_complete_response_from_streaming_chunks,
)
@pytest.mark.parametrize("is_async", [True, False])
def test_assemble_complete_response_from_streaming_chunks_1(is_async):
"""
Test 1 - ModelResponse with one list of streaming chunks. Assert chunks are appended to streaming_chunks; after the final chunk is sent, assert complete_streaming_response is not None
"""
request_kwargs = {
"model": "test_model",
"messages": [{"role": "user", "content": "Hello, world!"}],
}
list_streaming_chunks = []
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
delta=litellm.utils.Delta(
content="hello in response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks,
is_async=is_async,
)
# this is the 1st chunk - complete_streaming_response should be None
print("list_streaming_chunks", list_streaming_chunks)
print("complete_streaming_response", complete_streaming_response)
assert complete_streaming_response is None
assert len(list_streaming_chunks) == 1
assert list_streaming_chunks[0] == chunk
# Add final chunk
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
finish_reason="stop",
delta=litellm.utils.Delta(
content="end of response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks,
is_async=is_async,
)
print("list_streaming_chunks", list_streaming_chunks)
print("complete_streaming_response", complete_streaming_response)
# this is the 2nd chunk - complete_streaming_response should not be None
assert complete_streaming_response is not None
assert len(list_streaming_chunks) == 2
assert isinstance(complete_streaming_response, ModelResponse)
assert isinstance(complete_streaming_response.choices[0], Choices)
pass
@pytest.mark.parametrize("is_async", [True, False])
def test_assemble_complete_response_from_streaming_chunks_2(is_async):
"""
Test 2 - TextCompletionResponse with one list of streaming chunks. Assert chunks are appended to streaming_chunks; after the final chunk is sent, assert complete_streaming_response is not None
"""
from litellm.utils import TextCompletionStreamWrapper
_text_completion_stream_wrapper = TextCompletionStreamWrapper(
completion_stream=None, model="test_model"
)
request_kwargs = {
"model": "test_model",
"messages": [{"role": "user", "content": "Hello, world!"}],
}
list_streaming_chunks = []
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
delta=litellm.utils.Delta(
content="hello in response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
chunk = _text_completion_stream_wrapper.convert_to_text_completion_object(chunk)
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks,
is_async=is_async,
)
# this is the 1st chunk - complete_streaming_response should be None
print("list_streaming_chunks", list_streaming_chunks)
print("complete_streaming_response", complete_streaming_response)
assert complete_streaming_response is None
assert len(list_streaming_chunks) == 1
assert list_streaming_chunks[0] == chunk
# Add final chunk
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
finish_reason="stop",
delta=litellm.utils.Delta(
content="end of response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
chunk = _text_completion_stream_wrapper.convert_to_text_completion_object(chunk)
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks,
is_async=is_async,
)
print("list_streaming_chunks", list_streaming_chunks)
print("complete_streaming_response", complete_streaming_response)
# this is the 2nd chunk - complete_streaming_response should not be None
assert complete_streaming_response is not None
assert len(list_streaming_chunks) == 2
assert isinstance(complete_streaming_response, TextCompletionResponse)
assert isinstance(complete_streaming_response.choices[0], TextChoices)
pass
@pytest.mark.parametrize("is_async", [True, False])
def test_assemble_complete_response_from_streaming_chunks_3(is_async):
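    """
    Test 3 - Multiple lists of streaming chunks. Assert chunks are added to the correct list and complete_streaming_response is None; after the final chunk is sent, assert complete_streaming_response is not None
    """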
request_kwargs = {
"model": "test_model",
"messages": [{"role": "user", "content": "Hello, world!"}],
}
list_streaming_chunks_1 = []
list_streaming_chunks_2 = []
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
delta=litellm.utils.Delta(
content="hello in response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks_1,
is_async=is_async,
)
# this is the 1st chunk - complete_streaming_response should be None
print("list_streaming_chunks_1", list_streaming_chunks_1)
print("complete_streaming_response", complete_streaming_response)
assert complete_streaming_response is None
assert len(list_streaming_chunks_1) == 1
assert list_streaming_chunks_1[0] == chunk
assert len(list_streaming_chunks_2) == 0
# now add a chunk to the 2nd list
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks_2,
is_async=is_async,
)
print("list_streaming_chunks_2", list_streaming_chunks_2)
print("complete_streaming_response", complete_streaming_response)
assert complete_streaming_response is None
assert len(list_streaming_chunks_2) == 1
assert list_streaming_chunks_2[0] == chunk
assert len(list_streaming_chunks_1) == 1
# now add a chunk to the 1st list
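    # (final-chunk step reconstructed from the module docstring; mirrors test 1)
    chunk = {
        "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
        "choices": [
            litellm.utils.StreamingChoices(
                finish_reason="stop",
                delta=litellm.utils.Delta(
                    content="end of response",
                    function_call=None,
                    role=None,
                    tool_calls=None,
                ),
                index=0,
                logprobs=None,
            )
        ],
        "created": 1721353246,
        "model": "gpt-3.5-turbo",
        "object": "chat.completion.chunk",
        "system_fingerprint": None,
        "usage": None,
    }
    chunk = litellm.ModelResponse(**chunk, stream=True)
    complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
        result=chunk,
        start_time=datetime.now(),
        end_time=datetime.now(),
        request_kwargs=request_kwargs,
        streaming_chunks=list_streaming_chunks_1,
        is_async=is_async,
    )
    print("list_streaming_chunks_1", list_streaming_chunks_1)
    print("complete_streaming_response", complete_streaming_response)
    # the final chunk completes the 1st list; the 2nd list is unaffected
    assert complete_streaming_response is not None
    assert len(list_streaming_chunks_1) == 2
    assert isinstance(complete_streaming_response, ModelResponse)
    assert len(list_streaming_chunks_2) == 1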
@pytest.mark.parametrize("is_async", [True, False])
def test_assemble_complete_response_from_streaming_chunks_4(is_async):
"""
Test 4 - attempt to build a complete response when one chunk is poorly formatted
- Assert complete_streaming_response is None
- Assert list_streaming_chunks is not empty
"""
request_kwargs = {
"model": "test_model",
"messages": [{"role": "user", "content": "Hello, world!"}],
}
list_streaming_chunks = []
chunk = {
"id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi",
"choices": [
litellm.utils.StreamingChoices(
finish_reason="stop",
delta=litellm.utils.Delta(
content="end of response",
function_call=None,
role=None,
tool_calls=None,
),
index=0,
logprobs=None,
)
],
"created": 1721353246,
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"system_fingerprint": None,
"usage": None,
}
chunk = litellm.ModelResponse(**chunk, stream=True)
# remove attribute id from chunk
del chunk.id
complete_streaming_response = _assemble_complete_response_from_streaming_chunks(
result=chunk,
start_time=datetime.now(),
end_time=datetime.now(),
request_kwargs=request_kwargs,
streaming_chunks=list_streaming_chunks,
is_async=is_async,
)
print("complete_streaming_response", complete_streaming_response)
assert complete_streaming_response is None
print("list_streaming_chunks", list_streaming_chunks)
assert len(list_streaming_chunks) == 1