fix(utils.py): fix streaming logic

parent 4e608c86c1
commit 4ba18f9932

3 changed files with 87 additions and 31 deletions
@@ -3678,6 +3678,7 @@ def stream_chunk_builder(
     response["usage"]["total_tokens"] = (
         response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
     )

     return convert_to_model_response_object(
         response_object=response,
         model_response_object=model_response,
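For context, stream_chunk_builder merges streamed chunks back into a single response and recomputes usage as prompt plus completion tokens. A minimal usage sketch of that helper, assuming the documented litellm.completion / litellm.stream_chunk_builder API and an illustrative model name:

    # Sketch: collect streamed chunks, then rebuild a full response object.
    import litellm

    messages = [{"role": "user", "content": "Say hello"}]
    response = litellm.completion(model="gpt-3.5-turbo", messages=messages, stream=True)
    chunks = [chunk for chunk in response]
    rebuilt = litellm.stream_chunk_builder(chunks, messages=messages)
    # rebuilt.usage.total_tokens == prompt_tokens + completion_tokens
    print(rebuilt.usage)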
@@ -124,11 +124,12 @@ def test_parallel_function_call():
         pytest.fail(f"Error occurred: {e}")


-test_parallel_function_call()
+# test_parallel_function_call()


 def test_parallel_function_call_stream():
     try:
+        litellm.set_verbose = True
         # Step 1: send the conversation and available functions to the model
         messages = [
             {

@@ -217,4 +218,4 @@ def test_parallel_function_call_stream():
         pytest.fail(f"Error occurred: {e}")


-test_parallel_function_call_stream()
+# test_parallel_function_call_stream()
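The streaming test above exercises parallel tool calls over a streamed response. A rough sketch of the pattern such a test consumes, with an illustrative model name and tool schema (not copied from the test file):

    # Sketch: stream a tool-calling completion and inspect tool_call deltas.
    import litellm

    tools = [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }]
    messages = [{"role": "user", "content": "What's the weather in Boston and Paris?"}]
    response = litellm.completion(
        model="gpt-3.5-turbo-1106", messages=messages, tools=tools, stream=True
    )
    for chunk in response:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            # each fragment carries an index so arguments can be reassembled
            # per call when the model issues tool calls in parallel
            print(delta.tool_calls[0].index, delta.tool_calls[0].function.arguments)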
litellm/utils.py (112 changed lines)
@@ -276,15 +276,14 @@ class Delta(OpenAIObject):
         self.content = content
         self.role = role
         self.function_call = function_call
-        if tool_calls is not None:
-            if isinstance(tool_calls, dict):
-                self.tool_calls = []
-                for tool_call in tool_calls:
-                    if tool_call.get("index", None) is None:
-                        tool_call["index"] = 0
-                    self.tool_calls.append(ChatCompletionDeltaToolCall(**tool_call))
-            else:
-                self.tool_calls = tool_calls
+        if tool_calls is not None and isinstance(tool_calls, dict):
+            self.tool_calls = []
+            for tool_call in tool_calls:
+                if tool_call.get("index", None) is None:
+                    tool_call["index"] = 0
+                self.tool_calls.append(ChatCompletionDeltaToolCall(**tool_call))
+        else:
+            self.tool_calls = tool_calls

     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
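The rewritten branch defaults a missing tool-call index before wrapping each entry, so later chunk merging can group streamed fragments per call. A small standalone illustration of that defaulting, with made-up fragment shapes (plain dicts stand in for ChatCompletionDeltaToolCall):

    # Illustrative only: streamed tool-call fragments may arrive without an
    # "index"; defaulting it to 0 keeps per-call grouping well defined.
    fragments = [
        {"id": "call_abc", "type": "function",
         "function": {"name": "get_current_weather", "arguments": ""}},
        {"function": {"arguments": '{"location": "Boston"}'}},
    ]
    for fragment in fragments:
        if fragment.get("index", None) is None:
            fragment["index"] = 0
    print([f["index"] for f in fragments])  # [0, 0]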
@@ -8722,6 +8721,9 @@ class CustomStreamWrapper:
                 ):
                     try:
                         delta = dict(original_chunk.choices[0].delta)
+                        model_response.system_fingerprint = (
+                            original_chunk.system_fingerprint
+                        )
                         ## AZURE - check if arguments is not None
                         if (
                             original_chunk.choices[0].delta.function_call
@@ -8762,32 +8764,64 @@ class CustomStreamWrapper:
                                 delta = dict(original_chunk.choices[0].delta)
                                 print_verbose(f"original delta: {delta}")
                                 model_response.choices[0].delta = Delta(**delta)
+                                print_verbose(
+                                    f"new delta: {model_response.choices[0].delta}"
+                                )
                             except Exception as e:
                                 model_response.choices[0].delta = Delta()
                         else:
                             return
-            model_response.system_fingerprint = original_chunk.system_fingerprint
-            if self.sent_first_chunk == False:
-                model_response.choices[0].delta["role"] = "assistant"
-                self.sent_first_chunk = True
+            print_verbose(
+                f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
+            )
+            print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")

             ## RETURN ARG
             if (
-                completion_obj["content"] is not None
-                or response_obj.get("original_chunk", None) is not None
-            ):
-                hold = False
-                if completion_obj["content"] is not None:
-                    hold, model_response_str = self.check_special_tokens(
-                        chunk=completion_obj["content"],
-                        finish_reason=model_response.choices[0].finish_reason,
-                    )  # filter out bos/eos tokens from openai-compatible hf endpoints
-                    print_verbose(
-                        f"hold - {hold}, model_response_str - {model_response_str}"
-                    )
+                "content" in completion_obj
+                and isinstance(completion_obj["content"], str)
+                and len(completion_obj["content"]) > 0
+            ):  # cannot set content of an OpenAI Object to be an empty string
+                hold, model_response_str = self.check_special_tokens(
+                    chunk=completion_obj["content"],
+                    finish_reason=model_response.choices[0].finish_reason,
+                )  # filter out bos/eos tokens from openai-compatible hf endpoints
+                print_verbose(
+                    f"hold - {hold}, model_response_str - {model_response_str}"
+                )
                 if hold is False:
+                    ## check if openai/azure chunk
                     original_chunk = response_obj.get("original_chunk", None)
-                    if original_chunk is None:
+                    if original_chunk:
+                        model_response.id = original_chunk.id
+                        if len(original_chunk.choices) > 0:
+                            try:
+                                delta = dict(original_chunk.choices[0].delta)
+                                print_verbose(f"original delta: {delta}")
+                                model_response.choices[0].delta = Delta(**delta)
+                            except Exception as e:
+                                model_response.choices[0].delta = Delta()
+                        else:
+                            return
+                        model_response.system_fingerprint = (
+                            original_chunk.system_fingerprint
+                        )
+                        print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+                        if self.sent_first_chunk == False:
+                            model_response.choices[0].delta["role"] = "assistant"
+                            self.sent_first_chunk = True
+                        elif self.sent_first_chunk == True and hasattr(
+                            model_response.choices[0].delta, "role"
+                        ):
+                            _initial_delta = model_response.choices[
+                                0
+                            ].delta.model_dump()
+                            _initial_delta.pop("role", None)
+                            model_response.choices[0].delta = Delta(**_initial_delta)
+                            print_verbose(
+                                f"model_response.choices[0].delta: {model_response.choices[0].delta}"
+                            )
+                    else:
+                        ## else
                         completion_obj["content"] = model_response_str
                         if self.sent_first_chunk == False:
                             completion_obj["role"] = "assistant"
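The reworked guard only treats a chunk as a content chunk when completion_obj carries a non-empty string, since content of an OpenAI response object cannot be set to an empty string. A minimal sketch of that predicate, with a hypothetical helper name:

    # Hypothetical helper mirroring the new guard: only non-empty string
    # content is forwarded as a streamed content chunk.
    def has_streamable_content(completion_obj: dict) -> bool:
        return (
            "content" in completion_obj
            and isinstance(completion_obj["content"], str)
            and len(completion_obj["content"]) > 0
        )

    print(has_streamable_content({"content": ""}))       # False
    print(has_streamable_content({"content": None}))     # False
    print(has_streamable_content({"content": "Hello"}))  # True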
@@ -8812,6 +8846,14 @@ class CustomStreamWrapper:
                     model_response.choices[0].finish_reason
                 )  # ensure consistent output to openai
                 return model_response
+            elif (
+                model_response.choices[0].delta.tool_calls is not None
+                or model_response.choices[0].delta.function_call is not None
+            ):
+                if self.sent_first_chunk == False:
+                    model_response.choices[0].delta["role"] = "assistant"
+                    self.sent_first_chunk = True
+                return model_response
             else:
                 return
         except StopIteration:
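With this addition, a chunk whose delta carries only tool calls or a function call, and no text content, is still returned, and the assistant role is attached to the first chunk. A rough standalone sketch of that decision over plain dicts (names are illustrative, not the wrapper's actual state):

    # Illustrative sketch: return chunks that carry tool/function-call deltas
    # even when they have no text content; tag the first returned chunk.
    def should_return_chunk(delta: dict, sent_first_chunk: bool) -> bool:
        if delta.get("tool_calls") is not None or delta.get("function_call") is not None:
            if not sent_first_chunk:
                delta["role"] = "assistant"
            return True
        return False

    print(should_return_chunk({"tool_calls": [{"index": 0}]}, sent_first_chunk=False))  # True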
@@ -8860,7 +8902,14 @@ class CustomStreamWrapper:
                 print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
                 response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
                 print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")
-                if response is None:
+                if response is None or (
+                    isinstance(response, ModelResponse)
+                    and isinstance(response.choices[0], StreamingChoices)
+                    and response.choices[0].delta.content is None
+                    and response.choices[0].delta.function_call is None
+                    and response.choices[0].delta.tool_calls is None
+                ):
                     continue
                 ## LOGGING
                 threading.Thread(
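The widened condition now also skips deltas that are entirely empty, not just a None response. A standalone sketch of the same filter over plain dicts (not the ModelResponse/StreamingChoices types used above):

    # Illustrative filter: drop a streamed delta when it carries neither
    # content nor a function call nor tool calls.
    def is_empty_delta(delta: dict) -> bool:
        return (
            delta.get("content") is None
            and delta.get("function_call") is None
            and delta.get("tool_calls") is None
        )

    print(is_empty_delta({"content": None, "tool_calls": None}))  # True
    print(is_empty_delta({"content": "hi"}))                      # False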
@@ -8904,7 +8953,11 @@ class CustomStreamWrapper:
                     print_verbose(f"value of async chunk: {chunk}")
                     if chunk == "None" or chunk is None:
                         raise Exception
-                    elif self.custom_llm_provider == "gemini" and len(chunk.parts) == 0:
+                    elif (
+                        self.custom_llm_provider == "gemini"
+                        and hasattr(chunk, "parts")
+                        and len(chunk.parts) == 0
+                    ):
                         continue
                     # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks.
                     # __anext__ also calls async_success_handler, which does logging
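The extra hasattr check guards against Gemini chunks that do not expose a parts attribute at all; only chunks that have parts and have zero of them are skipped. A tiny sketch of the guard, using a stand-in object rather than the real Gemini chunk type:

    # Illustrative guard: only inspect chunk.parts when the attribute exists,
    # so chunks without it are not mistaken for empty Gemini chunks.
    class FakeChunk:
        pass

    chunk = FakeChunk()
    skip = hasattr(chunk, "parts") and len(chunk.parts) == 0
    print(skip)  # False: no "parts" attribute, so the chunk is not skipped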
@@ -8933,6 +8986,7 @@ class CustomStreamWrapper:
                     self.rules.post_call_rules(
                         input=self.response_uptil_now, model=self.model
                     )
+                    print_verbose(f"final returned processed chunk: {processed_chunk}")
                     return processed_chunk
                 raise StopAsyncIteration
             else:  # temporary patch for non-aiohttp async calls