fix(utils.py): fix azure streaming bug

This commit is contained in:
Krrish Dholakia 2023-12-04 12:38:15 -08:00
parent 90c13d39ac
commit 728b879c33
4 changed files with 91 additions and 7 deletions

View file

@ -1,3 +1,20 @@
model_list:
- model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "os.environ/AZURE_EUROPE_API_KEY"
api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
- model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "os.environ/AZURE_CANADA_API_KEY"
api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
- model_name: "azure-model"
litellm_params:
model: "azure/gpt-turbo"
api_key: "os.environ/AZURE-FRANCE-API-KEY"
api_base: "https://openai-france-1234.openai.azure.com"
litellm_settings:
drop_params: True
set_verbose: True

View file

@ -0,0 +1,27 @@
# #### What this tests ####
# # This tests the cost tracking function works with consecutive calls (~10 consecutive calls)
# import sys, os
# import traceback
# import pytest
# sys.path.insert(
# 0, os.path.abspath("../..")
# ) # Adds the parent directory to the system path
# import litellm
# async def test_proxy_cost_tracking():
# """
# Get expected cost.
# Create new key.
# Run 10 parallel calls.
# Check cost for key at the end.
# assert it's = expected cost.
# """
# model = "gpt-3.5-turbo"
# messages = [{"role": "user", "content": "Hey, how's it going?"}]
# number_of_calls = 10
# expected_cost = litellm.completion_cost(model=model, messages=messages) * number_of_calls
# async def litellm_acompletion():

View file

@ -110,4 +110,26 @@ def test_stream_chunk_builder_litellm_tool_call():
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
test_stream_chunk_builder_litellm_tool_call()
# test_stream_chunk_builder_litellm_tool_call()
def test_stream_chunk_builder_litellm_tool_call_regular_message():
try:
messages = [{"role": "user", "content": "Hey, how's it going?"}]
litellm.set_verbose = False
response = litellm.completion(
model="azure/gpt-4-nov-release",
messages=messages,
tools=tools_schema,
stream=True,
api_key="os.environ/AZURE_FRANCE_API_KEY",
api_base="https://openai-france-1234.openai.azure.com",
complete_response = True
)
print(f"complete response: {response}")
print(f"complete response usage: {response.usage}")
assert response.system_fingerprint is not None
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
test_stream_chunk_builder_litellm_tool_call_regular_message()

View file

@ -5266,9 +5266,27 @@ class CustomStreamWrapper:
print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason) # filter out bos/eos tokens from openai-compatible hf endpoints
print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
if hold is False:
## check if openai/azure chunk
original_chunk = response_obj.get("original_chunk", None)
if original_chunk:
model_response.id = original_chunk.id
if len(original_chunk.choices) > 0:
try:
delta = dict(original_chunk.choices[0].delta)
model_response.choices[0].delta = Delta(**delta)
except Exception as e:
model_response.choices[0].delta = Delta()
else:
return
model_response.system_fingerprint = original_chunk.system_fingerprint
if self.sent_first_chunk == False:
model_response.choices[0].delta["role"] = "assistant"
self.sent_first_chunk = True
else:
## else
completion_obj["content"] = model_response_str
if self.sent_first_chunk == False:
completion_obj["role"] = "assistant"