fix(utils.py): fix the response object returned when n>1 for stream=true

Fixes https://github.com/BerriAI/litellm/issues/3276
Krrish Dholakia 2024-04-25 13:27:29 -07:00
parent 5ad91e1277
commit 6c5c7cca3d
3 changed files with 310 additions and 55 deletions
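For context on the bug: when a caller requests several completions (n > 1) with stream=True, each streamed chunk can carry a choice with index 0 or index 1, but the wrapper previously rebuilt every chunk from choices[0] only, so the second completion's deltas were lost or mislabeled. A minimal sketch of the call pattern this commit targets (illustrative only; the model name and the accumulation loop are not part of the diff):

import litellm

# Request two completions over a stream; chunks then interleave choices with
# index 0 and index 1, and each index must survive translation unchanged.
response = litellm.completion(
    model="gpt-4-0613",  # any OpenAI-style chat model works here
    messages=[{"role": "user", "content": "Hey"}],
    n=2,
    stream=True,
)

texts: dict[int, str] = {}
for chunk in response:
    for choice in chunk.choices:
        if choice.delta.content:
            texts[choice.index] = texts.get(choice.index, "") + choice.delta.content

print(texts)  # expect two independently accumulated completions, keyed 0 and 1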


@@ -1,51 +1,18 @@
environment_variables:
  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
  alerting:
  - slack
  alerting_threshold: 300
  database_connection_pool_limit: 100
  database_connection_timeout: 60
  health_check_interval: 300
  proxy_batch_write_at: 10
  ui_access_mode: all
litellm_settings:
  allowed_fails: 3
  failure_callback:
  - prometheus
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  num_retries: 3
  service_callback:
  - prometheus_system
  success_callback:
  - prometheus
model_list:
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint
- litellm_params:
    model: gpt-3.5-turbo
  model_name: gpt-3.5-turbo
- model_name: llama-3
  litellm_params:
    model: replicate/meta/meta-llama-3-8b-instruct
- model_name: text-embedding-3-small
  litellm_params:
    model: text-embedding-3-small
- model_name: whisper
  litellm_params:
    model: azure/azure-whisper
    api_version: 2024-02-15-preview
    api_base: os.environ/AZURE_EUROPE_API_BASE
    api_key: os.environ/AZURE_EUROPE_API_KEY
  model_info:
    mode: audio_transcription
- litellm_params:
    model: gpt-4
  model_name: gpt-4
router_settings:
  allowed_fails: 3
  context_window_fallbacks: null
  cooldown_time: 1
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  - gpt-3.5-turbo-3:
    - fake-openai-endpoint
  num_retries: 3
  retry_after: 0
  routing_strategy: simple-shuffle
  routing_strategy_args: {}
  timeout: 6000
# litellm_settings:
#   cache: True
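For reference, the router_settings and model_list above map roughly onto litellm.Router arguments; a sketch of the equivalent in-code setup (not part of this commit, model list truncated to two entries):

from litellm import Router

# Rough in-code equivalent of router_settings / model_list above (subset only).
router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
            },
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo"},
        },
    ],
    fallbacks=[{"gpt-3.5-turbo": ["fake-openai-endpoint", "gpt-4"]}],
    num_retries=3,
    allowed_fails=3,
    cooldown_time=1,
    routing_strategy="simple-shuffle",
    timeout=6000,
)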


@@ -2446,6 +2446,34 @@ class ModelResponseIterator:
        return self.model_response


class ModelResponseListIterator:
    def __init__(self, model_responses):
        self.model_responses = model_responses
        self.index = 0

    # Sync iterator
    def __iter__(self):
        return self

    def __next__(self):
        if self.index >= len(self.model_responses):
            raise StopIteration
        model_response = self.model_responses[self.index]
        self.index += 1
        return model_response

    # Async iterator
    def __aiter__(self):
        return self

    async def __anext__(self):
        if self.index >= len(self.model_responses):
            raise StopAsyncIteration
        model_response = self.model_responses[self.index]
        self.index += 1
        return model_response

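ModelResponseListIterator replays a list of pre-built ModelResponse chunks as if they were a provider stream, which lets the test drive CustomStreamWrapper without any network call. A minimal usage sketch (the two empty chunks below are placeholders, not taken from the test):

fake_chunks = [
    litellm.ModelResponse(stream=True),
    litellm.ModelResponse(stream=True),
]
stream = ModelResponseListIterator(model_responses=fake_chunks)
assert next(stream) is fake_chunks[0]  # sync path
# the async path ("async for chunk in stream: ...") walks the same list
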
def test_unit_test_custom_stream_wrapper():
    """
    Test if last streaming chunk ends with '?', if the message repeats itself.
@@ -2486,3 +2514,259 @@ def test_unit_test_custom_stream_wrapper():
if "How are you?" in chunk.choices[0].delta.content:
freq += 1
assert freq == 1
chunks = [
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "It"},
"logprobs": {
"content": [
{
"token": "It",
"logprob": -1.5952516,
"bytes": [73, 116],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 1,
"delta": {"content": "Brown"},
"logprobs": {
"content": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "'s"},
"logprobs": {
"content": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
"top_logprobs": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": " impossible"},
"logprobs": {
"content": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
"top_logprobs": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "—even"},
"logprobs": {
"content": [
{
"token": "—even",
"logprob": -9999.0,
"bytes": [226, 128, 148, 101, 118, 101, 110],
"top_logprobs": [
{
"token": " to",
"logprob": -0.12302828,
"bytes": [32, 116, 111],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
],
},
]
def test_unit_test_custom_stream_wrapper_n():
    """
    Test if the translated output maps exactly to the received openai input

    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
    """
    litellm.set_verbose = False

    chunk_list = []
    for chunk in chunks:
        _chunk = litellm.ModelResponse(**chunk, stream=True)
        chunk_list.append(_chunk)

    completion_stream = ModelResponseListIterator(model_responses=chunk_list)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gpt-4-0613",
        custom_llm_provider="cached_response",
        logging_obj=litellm.Logging(
            model="gpt-4-0613",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    for idx, chunk in enumerate(response):
        chunk_dict = {}
        try:
            chunk_dict = chunk.model_dump(exclude_none=True)
        except:
            chunk_dict = chunk.dict(exclude_none=True)

        chunk_dict.pop("created")
        chunks[idx].pop("created")
        if chunks[idx]["system_fingerprint"] is None:
            chunks[idx].pop("system_fingerprint", None)

        if idx == 0:
            for choice in chunk_dict["choices"]:
                if "role" in choice["delta"]:
                    choice["delta"].pop("role")

        for choice in chunks[idx]["choices"]:
            # ignore finish reason None - since our pydantic object is set to exclude_none = true
            if "finish_reason" in choice and choice["finish_reason"] is None:
                choice.pop("finish_reason")
            if "logprobs" in choice and choice["logprobs"] is None:
                choice.pop("logprobs")

        assert (
            chunk_dict == chunks[idx]
        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"


@@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@@ -10120,12 +10121,15 @@ class CustomStreamWrapper:
                 model_response.id = original_chunk.id
                 self.response_id = original_chunk.id
                 if len(original_chunk.choices) > 0:
+                    choices = []
+                    for idx, choice in enumerate(original_chunk.choices):
                     try:
-                        delta = dict(original_chunk.choices[0].delta)
-                        print_verbose(f"original delta: {delta}")
-                        model_response.choices[0].delta = Delta(**delta)
+                        if isinstance(choice, BaseModel):
+                            choice_json = choice.model_dump()
+                            choices.append(StreamingChoices(**choice_json))
                     except Exception as e:
-                        model_response.choices[0].delta = Delta()
+                        choices.append(StreamingChoices())
+                    model_response.choices = choices
                 else:
                     return
                 model_response.system_fingerprint = (
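To make the intent of the new loop concrete: instead of collapsing everything onto model_response.choices[0], every incoming choice is serialized and rebuilt as a StreamingChoices object, so index 0 and index 1 both survive into the translated chunk. A standalone sketch of that round trip (assumes the Delta/StreamingChoices classes importable from litellm.utils at the time of this commit and a pydantic-v2 build; not taken verbatim from the diff):

from litellm.utils import Delta, StreamingChoices

# Two choices from one replayed chunk, as produced when n=2.
incoming = [
    StreamingChoices(index=0, delta=Delta(content="It")),
    StreamingChoices(index=1, delta=Delta(content="Brown")),
]

rebuilt = []
for choice in incoming:
    choice_json = choice.model_dump()  # .dict() on pydantic v1 builds
    rebuilt.append(StreamingChoices(**choice_json))

assert [c.index for c in rebuilt] == [0, 1]  # both choice indices are preserved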