forked from phoenix/litellm-mirror
fix(utils.py): fix the response object returned when n>1 for stream=true
Fixes https://github.com/BerriAI/litellm/issues/3276
This commit is contained in:
parent 5ad91e1277
commit 6c5c7cca3d
3 changed files with 310 additions and 55 deletions
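Note: the bug fixed here surfaces when a streaming request asks for multiple completions. A minimal reproduction sketch (model and prompt are illustrative, not taken from this commit):

# Minimal reproduction sketch for the n>1 + stream=True path (illustrative
# model/prompt). Before this fix, each chunk was rebuilt from choices[0] only,
# so content arriving with choice index 1 was mis-translated.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hi"}],
    n=2,          # two completions per request
    stream=True,  # delivered as chat.completion.chunk objects
)

texts = {}  # choice index -> accumulated content
for chunk in response:
    for choice in chunk.choices:
        texts[choice.index] = texts.get(choice.index, "") + (choice.delta.content or "")

print(texts)  # expect both index 0 and index 1 to accumulate text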
@@ -1,51 +1,18 @@
environment_variables:
  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
  alerting:
  - slack
  alerting_threshold: 300
  database_connection_pool_limit: 100
  database_connection_timeout: 60
  health_check_interval: 300
  proxy_batch_write_at: 10
  ui_access_mode: all
litellm_settings:
  allowed_fails: 3
  failure_callback:
  - prometheus
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  num_retries: 3
  service_callback:
  - prometheus_system
  success_callback:
  - prometheus
model_list:
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint
- litellm_params:
    model: gpt-3.5-turbo
  model_name: gpt-3.5-turbo
- model_name: llama-3
  litellm_params:
    model: replicate/meta/meta-llama-3-8b-instruct
- model_name: text-embedding-3-small
  litellm_params:
    model: text-embedding-3-small
- model_name: whisper
  litellm_params:
    model: azure/azure-whisper
    api_version: 2024-02-15-preview
    api_base: os.environ/AZURE_EUROPE_API_BASE
    api_key: os.environ/AZURE_EUROPE_API_KEY
  model_info:
    mode: audio_transcription
- litellm_params:
    model: gpt-4
  model_name: gpt-4
router_settings:
  allowed_fails: 3
  context_window_fallbacks: null
  cooldown_time: 1
  fallbacks:
  - gpt-3.5-turbo:
    - fake-openai-endpoint
    - gpt-4
  - gpt-3.5-turbo-3:
    - fake-openai-endpoint
  num_retries: 3
  retry_after: 0
  routing_strategy: simple-shuffle
  routing_strategy_args: {}
  timeout: 6000

# litellm_settings:
#   cache: True
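Note: a quick way to sanity-check a proxy config like the one above is to parse it and print the routable models. A hedged sketch, assuming the YAML is saved as config.yaml and PyYAML is installed:

# Sketch: parse the proxy config and list model_name -> underlying model.
# Assumes the YAML above was saved as "config.yaml"; requires PyYAML.
import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

for entry in config.get("model_list", []):
    params = entry.get("litellm_params", {})
    print(entry.get("model_name"), "->", params.get("model"))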
@@ -2446,6 +2446,34 @@ class ModelResponseIterator:
        return self.model_response


class ModelResponseListIterator:
    def __init__(self, model_responses):
        self.model_responses = model_responses
        self.index = 0

    # Sync iterator
    def __iter__(self):
        return self

    def __next__(self):
        if self.index >= len(self.model_responses):
            raise StopIteration
        model_response = self.model_responses[self.index]
        self.index += 1
        return model_response

    # Async iterator
    def __aiter__(self):
        return self

    async def __anext__(self):
        if self.index >= len(self.model_responses):
            raise StopAsyncIteration
        model_response = self.model_responses[self.index]
        self.index += 1
        return model_response


def test_unit_test_custom_stream_wrapper():
    """
    Test that the message does not repeat itself when the last streaming chunk ends with '?'.
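Note: ModelResponseListIterator implements both the sync and async iterator protocols, so a fixed list can stand in for a live completion stream in tests. A small usage sketch (placeholder strings stand in for real ModelResponse chunks):

# Usage sketch: a fixed list doubles as both a sync and an async stream.
import asyncio

sync_stream = ModelResponseListIterator(model_responses=["chunk-1", "chunk-2"])
assert list(sync_stream) == ["chunk-1", "chunk-2"]

async def drain(stream):
    return [chunk async for chunk in stream]

async_stream = ModelResponseListIterator(model_responses=["chunk-1", "chunk-2"])
assert asyncio.run(drain(async_stream)) == ["chunk-1", "chunk-2"]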
@@ -2486,3 +2514,259 @@ def test_unit_test_custom_stream_wrapper():
        if "How are you?" in chunk.choices[0].delta.content:
            freq += 1
    assert freq == 1


chunks = [
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "It"},
                "logprobs": {
                    "content": [
                        {
                            "token": "It",
                            "logprob": -1.5952516,
                            "bytes": [73, 116],
                            "top_logprobs": [
                                {
                                    "token": "Brown",
                                    "logprob": -0.7358765,
                                    "bytes": [66, 114, 111, 119, 110],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 1,
                "delta": {"content": "Brown"},
                "logprobs": {
                    "content": [
                        {
                            "token": "Brown",
                            "logprob": -0.7358765,
                            "bytes": [66, 114, 111, 119, 110],
                            "top_logprobs": [
                                {
                                    "token": "Brown",
                                    "logprob": -0.7358765,
                                    "bytes": [66, 114, 111, 119, 110],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "'s"},
                "logprobs": {
                    "content": [
                        {
                            "token": "'s",
                            "logprob": -0.006786893,
                            "bytes": [39, 115],
                            "top_logprobs": [
                                {
                                    "token": "'s",
                                    "logprob": -0.006786893,
                                    "bytes": [39, 115],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": " impossible"},
                "logprobs": {
                    "content": [
                        {
                            "token": " impossible",
                            "logprob": -0.06528423,
                            "bytes": [32, 105, 109, 112, 111, 115, 115, 105, 98, 108, 101],
                            "top_logprobs": [
                                {
                                    "token": " impossible",
                                    "logprob": -0.06528423,
                                    "bytes": [32, 105, 109, 112, 111, 115, 115, 105, 98, 108, 101],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "—even"},
                "logprobs": {
                    "content": [
                        {
                            "token": "—even",
                            "logprob": -9999.0,
                            "bytes": [226, 128, 148, 101, 118, 101, 110],
                            "top_logprobs": [
                                {
                                    "token": " to",
                                    "logprob": -0.12302828,
                                    "bytes": [32, 116, 111],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
        ],
    },
]


def test_unit_test_custom_stream_wrapper_n():
    """
    Test that the translated output maps exactly to the received OpenAI input.

    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
    """
    litellm.set_verbose = False

    chunk_list = []
    for chunk in chunks:
        _chunk = litellm.ModelResponse(**chunk, stream=True)
        chunk_list.append(_chunk)

    completion_stream = ModelResponseListIterator(model_responses=chunk_list)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gpt-4-0613",
        custom_llm_provider="cached_response",
        logging_obj=litellm.Logging(
            model="gpt-4-0613",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    for idx, chunk in enumerate(response):
        chunk_dict = {}
        try:
            chunk_dict = chunk.model_dump(exclude_none=True)
        except Exception:
            chunk_dict = chunk.dict(exclude_none=True)

        chunk_dict.pop("created")
        chunks[idx].pop("created")
        if chunks[idx]["system_fingerprint"] is None:
            chunks[idx].pop("system_fingerprint", None)
        if idx == 0:
            for choice in chunk_dict["choices"]:
                if "role" in choice["delta"]:
                    choice["delta"].pop("role")

        for choice in chunks[idx]["choices"]:
            # ignore finish_reason=None, since the pydantic object is dumped with exclude_none=True
            if "finish_reason" in choice and choice["finish_reason"] is None:
                choice.pop("finish_reason")
            if "logprobs" in choice and choice["logprobs"] is None:
                choice.pop("logprobs")

        assert (
            chunk_dict == chunks[idx]
        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"
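Note: the fixture above interleaves two completions, which is exactly what the test guards. Regrouping the raw chunks by choice index (a hypothetical helper, not part of the commit) makes the expectation visible:

# Hypothetical helper (not in the commit): regroup the raw fixture chunks by
# choice index to show the two interleaved completions.
def collect_by_index(raw_chunks):
    texts = {}
    for raw in raw_chunks:
        for choice in raw["choices"]:
            content = choice.get("delta", {}).get("content")
            if content is not None:
                texts[choice["index"]] = texts.get(choice["index"], "") + content
    return texts

# collect_by_index(chunks) == {0: "It's impossible—even", 1: "Brown"}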
@@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@@ -10120,12 +10121,15 @@ class CustomStreamWrapper:
                 model_response.id = original_chunk.id
                 self.response_id = original_chunk.id
                 if len(original_chunk.choices) > 0:
-                    try:
-                        delta = dict(original_chunk.choices[0].delta)
-                        print_verbose(f"original delta: {delta}")
-                        model_response.choices[0].delta = Delta(**delta)
-                    except Exception as e:
-                        model_response.choices[0].delta = Delta()
+                    choices = []
+                    for idx, choice in enumerate(original_chunk.choices):
+                        try:
+                            if isinstance(choice, BaseModel):
+                                choice_json = choice.model_dump()
+                                choices.append(StreamingChoices(**choice_json))
+                        except Exception as e:
+                            choices.append(StreamingChoices())
+                    model_response.choices = choices
                 else:
                     return
                 model_response.system_fingerprint = (
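Note: the essence of the fix is rebuilding the outgoing chunk from every incoming choice instead of only choices[0]. A standalone sketch of the pattern, with a pydantic stand-in replacing litellm's StreamingChoices:

# Standalone sketch of the fixed pattern: serialize each incoming choice and
# rebuild the full choices list, preserving every index. FakeChoice and the
# plain-dict output are stand-ins for litellm's pydantic types.
from pydantic import BaseModel

class FakeChoice(BaseModel):
    index: int
    delta: dict

incoming = [
    FakeChoice(index=0, delta={"content": "It"}),
    FakeChoice(index=1, delta={"content": "Brown"}),
]

choices = []
for choice in incoming:
    try:
        if isinstance(choice, BaseModel):
            choices.append(choice.model_dump())  # .dict() on pydantic v1
    except Exception:
        choices.append({})  # degrade gracefully, mirroring the fix

assert [c["index"] for c in choices] == [0, 1]  # both completions survive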