# litellm/tests/local_testing/test_stream_chunk_builder.py
import asyncio
import os
import sys
import time
import traceback

import pytest
from typing import List

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import dotenv
from openai import OpenAI

import litellm
import stream_chunk_testdata
from litellm import completion, stream_chunk_builder
from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse


def check_non_streaming_response(completion):
    assert completion.choices[0].message.audio is not None, "Audio response is missing"
    print("audio", completion.choices[0].message.audio)
    assert isinstance(
        completion.choices[0].message.audio, ChatCompletionAudioResponse
    ), "Invalid audio response type"
    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"


dotenv.load_dotenv()

user_message = "What is the current weather in Boston?"
messages = [{"content": user_message, "role": "user"}]
function_schema = {
"name": "get_weather",
"description": "gets the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
"required": ["location"],
},
}
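# OpenAI `tools`-format schema used by the streaming tool-call tests below.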
tools_schema = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
# def test_stream_chunk_builder_tools():
# try:
# litellm.set_verbose = False
# response = client.chat.completions.create(
# model="gpt-3.5-turbo",
# messages=messages,
# tools=tools_schema,
# # stream=True,
# # complete_response=True # runs stream_chunk_builder under-the-hood
# )
# print(f"response: {response}")
# print(f"response usage: {response.usage}")
# except Exception as e:
# pytest.fail(f"An exception occurred - {str(e)}")
# test_stream_chunk_builder_tools()
def test_stream_chunk_builder_litellm_function_call():
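    """
    Non-streaming completion with the legacy `functions` param; just checks the call
    does not raise.
    """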
try:
litellm.set_verbose = False
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
functions=[function_schema],
# stream=True,
# complete_response=True # runs stream_chunk_builder under-the-hood
)
print(f"response: {response}")
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
# test_stream_chunk_builder_litellm_function_call()
def test_stream_chunk_builder_litellm_tool_call():
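    """
    Streaming completion with `tools`, rebuilt via complete_response=True; checks the
    rebuilt usage has non-zero prompt/completion tokens and a consistent total.
    """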
try:
litellm.set_verbose = True
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
tools=tools_schema,
stream=True,
complete_response=True,
)
print(f"complete response: {response}")
print(f"complete response usage: {response.usage}")
assert response.usage.completion_tokens > 0
assert response.usage.prompt_tokens > 0
assert (
response.usage.total_tokens
== response.usage.completion_tokens + response.usage.prompt_tokens
)
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
# test_stream_chunk_builder_litellm_tool_call()
def test_stream_chunk_builder_litellm_tool_call_regular_message():
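    """
    Streaming completion where the prompt is a regular message (no tool call expected);
    checks the rebuilt usage and that the provider is recorded in _hidden_params.
    """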
try:
messages = [{"role": "user", "content": "Hey, how's it going?"}]
# litellm.set_verbose = True
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
tools=tools_schema,
stream=True,
complete_response=True,
)
print(f"complete response: {response}")
print(f"complete response usage: {response.usage}")
assert response.usage.completion_tokens > 0
assert response.usage.prompt_tokens > 0
assert (
response.usage.total_tokens
== response.usage.completion_tokens + response.usage.prompt_tokens
)
# check provider is in hidden params
print("hidden params", response._hidden_params)
assert response._hidden_params["custom_llm_provider"] == "openai"
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
# test_stream_chunk_builder_litellm_tool_call_regular_message()
def test_stream_chunk_builder_litellm_usage_chunks():
"""
Checks if stream_chunk_builder is able to correctly rebuild with given metadata from streaming chunks
"""
from litellm.types.utils import Usage
messages = [
{"role": "user", "content": "Tell me the funniest joke you know."},
{
"role": "assistant",
"content": "Why did the chicken cross the road?\nYou will not guess this one I bet\n",
},
{"role": "user", "content": "I do not know, why?"},
{"role": "assistant", "content": "uhhhh\n\n\nhmmmm.....\nthinking....\n"},
{"role": "user", "content": "\nI am waiting...\n\n...\n"},
]
usage: litellm.Usage = Usage(
completion_tokens=27,
prompt_tokens=55,
total_tokens=82,
completion_tokens_details=None,
prompt_tokens_details=None,
)
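    # Hard-coded reference usage for the messages above; the prompt token count is
    # compared against the value rebuilt from the streamed usage chunks below.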
gemini_pt = usage.prompt_tokens
# make a streaming gemini call
try:
response = completion(
model="gemini/gemini-1.5-flash",
messages=messages,
stream=True,
complete_response=True,
stream_options={"include_usage": True},
)
except litellm.InternalServerError as e:
pytest.skip(f"Skipping test due to internal server error - {str(e)}")
usage: litellm.Usage = response.usage
stream_rebuilt_pt = usage.prompt_tokens
# assert prompt tokens are the same
assert gemini_pt == stream_rebuilt_pt
def test_stream_chunk_builder_litellm_mixed_calls():
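    """
    Rebuilds a canned chunk stream (see stream_chunk_testdata) that mixes text content
    and a tool call; checks both are preserved in the rebuilt response.
    """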
response = stream_chunk_builder(stream_chunk_testdata.chunks)
assert (
response.choices[0].message.content
== "To answer your question about how many rows are in the 'users' table, I'll need to run a SQL query. Let me do that for you."
)
print(response.choices[0].message.tool_calls[0].to_dict())
assert len(response.choices[0].message.tool_calls) == 1
assert response.choices[0].message.tool_calls[0].to_dict() == {
"function": {
"arguments": '{"query": "SELECT COUNT(*) FROM users;"}',
"name": "sql_query",
},
"id": "toolu_01H3AjkLpRtGQrof13CBnWfK",
"type": "function",
}
def test_stream_chunk_builder_litellm_empty_chunks():
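    """
    chunks=None should raise litellm.APIError; an empty chunk list should return None.
    """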
with pytest.raises(litellm.APIError):
response = stream_chunk_builder(chunks=None)
response = stream_chunk_builder(chunks=[])
assert response is None
def test_stream_chunk_builder_multiple_tool_calls():
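    """
    Rebuilds a hand-written stream containing two parallel tool calls (indices 0 and 1)
    and checks that each call's id, name, and arguments are correctly reassembled.
    """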
init_chunks = [
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"id": "call_X9P9B6STj7ze8OsJCGkfoN94",
"function": {"arguments": "", "name": "exponentiate"},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": '{"ba'},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": 'se": '},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": '3, "ex'},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": "pone"},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": 'nt": '},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": "5}"},
"type": "function",
"index": 0,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"id": "call_Qq8yDeRx7v276abRcLrYORdW",
"function": {"arguments": "", "name": "add"},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": '{"fi'},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": "rst_i"},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": 'nt": 1'},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": '2, "'},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": "secon"},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": 'd_int"'},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"tool_calls": [
{
"function": {"arguments": ": 3}"},
"type": "function",
"index": 1,
}
],
},
}
],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
{
"id": "chatcmpl-A5kCnzaxRsknd6008552ZhDi71yPt",
"choices": [{"finish_reason": "tool_calls", "index": 0, "delta": {}}],
"created": 1725932618,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"system_fingerprint": "fp_b2ffeb16ee",
},
]
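    # Convert the raw chunk dicts into litellm ModelResponse stream objects before rebuilding.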
chunks = []
for chunk in init_chunks:
chunks.append(litellm.ModelResponse(**chunk, stream=True))
response = stream_chunk_builder(chunks=chunks)
print(f"Returned response: {response}")
completed_response = {
"id": "chatcmpl-A61mXjvcRX0Xr2IiojN9TPiy1P3Fm",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"message": {
"content": None,
"role": "assistant",
"tool_calls": [
{
"function": {
"arguments": '{"base": 3, "exponent": 5}',
"name": "exponentiate",
},
"id": "call_X9P9B6STj7ze8OsJCGkfoN94",
"type": "function",
},
{
"function": {
"arguments": '{"first_int": 12, "second_int": 3}',
"name": "add",
},
"id": "call_Qq8yDeRx7v276abRcLrYORdW",
"type": "function",
},
],
"function_call": None,
},
}
],
"created": 1726000181,
"model": "gpt-4o-2024-05-13",
"object": "chat.completion",
"system_fingerprint": "fp_25624ae3a5",
"usage": {"completion_tokens": 55, "prompt_tokens": 127, "total_tokens": 182},
"service_tier": None,
}
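    # Only `choices` is compared below, so the id/model/usage fields here don't need to
    # match the synthetic chunks above.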
expected_response = litellm.ModelResponse(**completed_response)
print(f"\n\nexpected_response:\n{expected_response}\n\n")
assert (
expected_response.choices == response.choices
), "\nGot={}\n, Expected={}\n".format(response.choices, expected_response.choices)
def test_stream_chunk_builder_openai_prompt_caching():
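    """
    Streams directly through the OpenAI client with include_usage, then checks that the
    usage object rebuilt by stream_chunk_builder matches the final usage chunk
    field-for-field, including nested detail objects such as prompt token details.
    """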
from openai import OpenAI
from pydantic import BaseModel
client = OpenAI(
# This is the default and can be omitted
api_key=os.getenv("OPENAI_API_KEY"),
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo",
stream=True,
stream_options={"include_usage": True},
)
chunks: List[litellm.ModelResponse] = []
usage_obj = None
for chunk in chat_completion:
chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))
print(f"chunks: {chunks}")
usage_obj: litellm.Usage = chunks[-1].usage # type: ignore
response = stream_chunk_builder(chunks=chunks)
print(f"response: {response}")
print(f"response usage: {response.usage}")
for k, v in usage_obj.model_dump(exclude_none=True).items():
print(k, v)
response_usage_value = getattr(response.usage, k) # type: ignore
print(f"response_usage_value: {response_usage_value}")
print(f"type: {type(response_usage_value)}")
if isinstance(response_usage_value, BaseModel):
assert response_usage_value.model_dump(exclude_none=True) == v
else:
assert response_usage_value == v
def test_stream_chunk_builder_openai_audio_output_usage():
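    """
    Same usage comparison for gpt-4o-audio-preview; also checks that the rebuilt
    response contains a valid audio block via check_non_streaming_response.
    """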
from pydantic import BaseModel
from openai import OpenAI
from typing import Optional
client = OpenAI(
# This is the default and can be omitted
api_key=os.getenv("OPENAI_API_KEY"),
)
completion = client.chat.completions.create(
model="gpt-4o-audio-preview",
modalities=["text", "audio"],
audio={"voice": "alloy", "format": "pcm16"},
messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
stream=True,
stream_options={"include_usage": True},
)
chunks = []
for chunk in completion:
chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))
usage_obj: Optional[litellm.Usage] = None
for index, chunk in enumerate(chunks):
if hasattr(chunk, "usage"):
usage_obj = chunk.usage
print(f"chunk usage: {chunk.usage}")
print(f"index: {index}")
print(f"len chunks: {len(chunks)}")
print(f"usage_obj: {usage_obj}")
response = stream_chunk_builder(chunks=chunks)
print(f"response usage: {response.usage}")
check_non_streaming_response(response)
print(f"response: {response}")
for k, v in usage_obj.model_dump(exclude_none=True).items():
print(k, v)
response_usage_value = getattr(response.usage, k) # type: ignore
print(f"response_usage_value: {response_usage_value}")
print(f"type: {type(response_usage_value)}")
if isinstance(response_usage_value, BaseModel):
assert response_usage_value.model_dump(exclude_none=True) == v
else:
assert response_usage_value == v