forked from phoenix/litellm-mirror
fix(utils.py): fix azure streaming bug
This commit is contained in:
parent 90c13d39ac
commit 728b879c33
4 changed files with 91 additions and 7 deletions
@@ -1,3 +1,20 @@
+model_list:
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_EUROPE_API_KEY"
+      api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_CANADA_API_KEY"
+      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-turbo"
+      api_key: "os.environ/AZURE-FRANCE-API-KEY"
+      api_base: "https://openai-france-1234.openai.azure.com"
+
 litellm_settings:
   drop_params: True
   set_verbose: True
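For context, the config hunk above registers three Azure deployments under the single logical name "azure-model"; the `os.environ/...` values tell the proxy to read each key from the environment at load time. A minimal sketch of the equivalent in-code setup with litellm's Router, which load-balances across deployments that share a model_name (the two deployments and the direct `os.environ` reads here are illustrative, not part of this commit):

```python
# Sketch only: how a model_list like the YAML above is consumed in code.
# The Router usage and direct os.environ reads are illustrative; this commit
# only touches the YAML config.
import os
from litellm import Router

model_list = [
    {
        "model_name": "azure-model",
        "litellm_params": {
            "model": "azure/gpt-35-turbo",
            "api_key": os.environ.get("AZURE_EUROPE_API_KEY"),
            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
        },
    },
    {
        "model_name": "azure-model",
        "litellm_params": {
            "model": "azure/gpt-35-turbo",
            "api_key": os.environ.get("AZURE_CANADA_API_KEY"),
            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
        },
    },
]

router = Router(model_list=model_list)

# Both deployments share the logical name "azure-model", so the router
# picks one of them for this call.
response = router.completion(
    model="azure-model",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
```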
litellm/tests/test_proxy_server_cost.py (new file, +27)

@@ -0,0 +1,27 @@
+# #### What this tests ####
+# # This tests the cost tracking function works with consecutive calls (~10 consecutive calls)
+
+# import sys, os
+# import traceback
+# import pytest
+# sys.path.insert(
+#     0, os.path.abspath("../..")
+# )  # Adds the parent directory to the system path
+# import litellm
+
+# async def test_proxy_cost_tracking():
+#     """
+#     Get expected cost.
+#     Create new key.
+#     Run 10 parallel calls.
+#     Check cost for key at the end.
+#     assert it's = expected cost.
+#     """
+#     model = "gpt-3.5-turbo"
+#     messages = [{"role": "user", "content": "Hey, how's it going?"}]
+#     number_of_calls = 10
+#     expected_cost = litellm.completion_cost(model=model, messages=messages) * number_of_calls
+#     async def litellm_acompletion():
+
+
+
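The new test file is committed entirely commented out and cuts off at the inner `litellm_acompletion` helper. A hedged sketch of how the fan-out could be completed, assuming `asyncio.gather` over `litellm.acompletion` and pricing each finished response with `litellm.completion_cost` (the comparison against the proxy key's tracked spend described in the docstring is not implemented here):

```python
# Sketch only: fills in the commented-out test above. The asyncio.gather fan-out
# and the use of completion_cost(completion_response=...) are assumptions, not
# part of this commit.
import asyncio
import litellm

async def proxy_cost_tracking_sketch():
    model = "gpt-3.5-turbo"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    number_of_calls = 10

    async def litellm_acompletion():
        # one async completion; the real test would route this through a proxy key
        return await litellm.acompletion(model=model, messages=messages)

    responses = await asyncio.gather(
        *[litellm_acompletion() for _ in range(number_of_calls)]
    )

    # completion_cost(completion_response=...) prices a finished response,
    # so summing over all responses gives the spend the proxy should have tracked
    total_cost = sum(
        litellm.completion_cost(completion_response=r) for r in responses
    )
    print(f"total cost across {number_of_calls} calls: {total_cost}")
    return total_cost

if __name__ == "__main__":
    asyncio.run(proxy_cost_tracking_sketch())
```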
@@ -110,4 +110,26 @@ def test_stream_chunk_builder_litellm_tool_call():
     except Exception as e:
         pytest.fail(f"An exception occurred - {str(e)}")
 
-test_stream_chunk_builder_litellm_tool_call()
+# test_stream_chunk_builder_litellm_tool_call()
+
+def test_stream_chunk_builder_litellm_tool_call_regular_message():
+    try:
+        messages = [{"role": "user", "content": "Hey, how's it going?"}]
+        litellm.set_verbose = False
+        response = litellm.completion(
+            model="azure/gpt-4-nov-release",
+            messages=messages,
+            tools=tools_schema,
+            stream=True,
+            api_key="os.environ/AZURE_FRANCE_API_KEY",
+            api_base="https://openai-france-1234.openai.azure.com",
+            complete_response = True
+        )
+
+        print(f"complete response: {response}")
+        print(f"complete response usage: {response.usage}")
+        assert response.system_fingerprint is not None
+    except Exception as e:
+        pytest.fail(f"An exception occurred - {str(e)}")
+
+test_stream_chunk_builder_litellm_tool_call_regular_message()
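The new regular-message test streams with `complete_response=True` and then checks that `usage` and `system_fingerprint` survive on the rebuilt response. For readers unfamiliar with the helper under test, the manual equivalent collects the chunks and passes them to `litellm.stream_chunk_builder`; a minimal sketch (the model name is a placeholder, not the Azure deployment used in the test):

```python
# Sketch only: the manual equivalent of streaming with complete_response=True.
# The model name is a placeholder; any configured chat model works.
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]

stream = litellm.completion(model="gpt-3.5-turbo", messages=messages, stream=True)
chunks = list(stream)

# stream_chunk_builder stitches the streamed deltas back into one response
# object, including the usage block the test above prints.
rebuilt = litellm.stream_chunk_builder(chunks, messages=messages)
print(rebuilt.usage)
```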
@@ -5266,14 +5266,32 @@ class CustomStreamWrapper:
             print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
             print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
             if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
+                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason) # filter out bos/eos tokens from openai-compatible hf endpoints
                 print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
                 if hold is False:
-                    completion_obj["content"] = model_response_str
-                    if self.sent_first_chunk == False:
-                        completion_obj["role"] = "assistant"
-                        self.sent_first_chunk = True
-                    model_response.choices[0].delta = Delta(**completion_obj)
+                    ## check if openai/azure chunk
+                    original_chunk = response_obj.get("original_chunk", None)
+                    if original_chunk:
+                        model_response.id = original_chunk.id
+                        if len(original_chunk.choices) > 0:
+                            try:
+                                delta = dict(original_chunk.choices[0].delta)
+                                model_response.choices[0].delta = Delta(**delta)
+                            except Exception as e:
+                                model_response.choices[0].delta = Delta()
+                        else:
+                            return
+                        model_response.system_fingerprint = original_chunk.system_fingerprint
+                        if self.sent_first_chunk == False:
+                            model_response.choices[0].delta["role"] = "assistant"
+                            self.sent_first_chunk = True
+                    else:
+                        ## else
+                        completion_obj["content"] = model_response_str
+                        if self.sent_first_chunk == False:
+                            completion_obj["role"] = "assistant"
+                            self.sent_first_chunk = True
+                        model_response.choices[0].delta = Delta(**completion_obj)
                     # LOGGING
                     threading.Thread(target=self.logging_obj.success_handler, args=(model_response,)).start()
                     print_verbose(f"model_response: {model_response}")
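The utils.py hunk is the streaming fix named in the commit title: when a chunk carries an `original_chunk` from an OpenAI/Azure SDK response, CustomStreamWrapper now copies that chunk's id, delta, and system_fingerprint onto the outgoing response (returning early when the chunk has no choices) instead of building a text-only Delta. At the call site the effect looks roughly like this (the deployment name is a placeholder):

```python
# Sketch only: what the CustomStreamWrapper change preserves on Azure/OpenAI
# streams. The deployment name is a placeholder.
import litellm

response = litellm.completion(
    model="azure/my-gpt-35-deployment",  # placeholder Azure deployment
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    stream=True,
)

for chunk in response:
    # With the fix, each chunk keeps the provider's id and system_fingerprint,
    # and tool-call deltas are copied through instead of being replaced by a
    # text-only Delta.
    print(chunk.id, chunk.system_fingerprint, chunk.choices[0].delta)
```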