LiteLLM OpenAI audio streaming (#6325)

* refactor(main.py): streaming_chunk_builder

use <100 lines of code

refactor each component into a separate function - easier to maintain + test
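
A rough sketch of the per-component split this refactor describes; the helper names below are illustrative placeholders, not the actual functions added to LiteLLM:

# Illustrative only: hypothetical helper names, not LiteLLM's actual API.
from typing import List, Optional


def _combine_content(chunks: List[dict]) -> str:
    # Join the text deltas from every streamed chunk, in order.
    return "".join(
        choice.get("delta", {}).get("content") or ""
        for chunk in chunks
        for choice in (chunk.get("choices") or [])
    )


def _combine_usage(chunks: List[dict]) -> Optional[dict]:
    # Usage typically arrives only on the final chunk when
    # stream_options={"include_usage": True} is set.
    for chunk in reversed(chunks):
        if chunk.get("usage"):
            return chunk["usage"]
    return None


def build_complete_response(chunks: List[dict]) -> dict:
    # Each component is rebuilt by its own small helper, keeping the
    # top-level builder short and each piece unit-testable.
    return {
        "choices": [
            {"message": {"role": "assistant", "content": _combine_content(chunks)}}
        ],
        "usage": _combine_usage(chunks),
    }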

* fix(utils.py): handle choices being None

openai pydantic schema updated
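
A minimal sketch of the defensive pattern this fix implies (the actual guard in utils.py may differ): treat a chunk whose `choices` field is `None` as having no choices instead of iterating over `None`.

# Sketch only: the real guard in litellm/utils.py may look different.
from typing import Any, List, Optional


def iter_choices(chunk: Any) -> List[Any]:
    # The updated OpenAI pydantic schema allows `choices` to be None
    # (e.g. on the usage-only final chunk), so fall back to an empty list.
    choices: Optional[List[Any]] = getattr(chunk, "choices", None)
    return choices or []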

* fix(main.py): fix linting error

* feat(streaming_chunk_builder_utils.py): update stream chunk builder to support rebuilding audio chunks from openai
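
A hedged sketch of what "rebuilding audio chunks" amounts to: streamed audio deltas carry partial base64 `data` and `transcript` fields that get stitched back into one audio object. The field handling below is illustrative and may differ from the actual implementation in streaming_chunk_builder_utils.py.

# Sketch only: details may differ from streaming_chunk_builder_utils.py.
from typing import List, Optional


def combine_audio_deltas(audio_deltas: List[dict]) -> Optional[dict]:
    if not audio_deltas:
        return None
    combined = {"id": None, "data": "", "transcript": "", "expires_at": 0}
    for delta in audio_deltas:
        if delta.get("id"):
            combined["id"] = delta["id"]
        # base64-encoded audio arrives in pieces; append them in stream order
        combined["data"] += delta.get("data") or ""
        combined["transcript"] += delta.get("transcript") or ""
        if delta.get("expires_at"):
            combined["expires_at"] = delta["expires_at"]
    return combined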

* test(test_custom_callback_input.py): test message redaction works for audio output
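
For context, message redaction in LiteLLM is controlled by the `litellm.turn_off_message_logging` flag; the test checks that callbacks still receive a redacted payload when the response contains audio. A minimal usage sketch (handler details omitted):

import litellm

# Redact message content (including audio output) from logging callbacks.
litellm.turn_off_message_logging = True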

* fix(streaming_chunk_builder_utils.py): return anthropic token usage info directly
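
The idea here, sketched loosely: when the provider already reports token usage in the stream (as Anthropic does), pass that usage through rather than re-counting tokens from the reassembled text. Names below are illustrative.

# Sketch only: illustrative, not LiteLLM's actual usage-handling code.
from typing import List, Optional


def provider_reported_usage(chunks: List[dict]) -> Optional[dict]:
    # Prefer the usage block the provider attached to the stream
    # (e.g. Anthropic's token counts) over local re-tokenization.
    for chunk in reversed(chunks):
        usage = chunk.get("usage")
        if usage and usage.get("prompt_tokens") is not None:
            return usage
    return None  # caller falls back to counting tokens itself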

* fix(stream_chunk_builder_utils.py): run validation check before entering chunk processor
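
Roughly, the pre-flight validation means failing fast on an empty or malformed chunk list before any per-component assembly runs; a sketch with illustrative names:

# Sketch only: illustrative validation, not the exact check in the PR.
from typing import List


def validate_chunks(chunks: List[dict]) -> None:
    if not chunks:
        raise ValueError("stream_chunk_builder received an empty chunk list")
    for i, chunk in enumerate(chunks):
        if not chunk.get("choices") and not chunk.get("usage"):
            raise ValueError(f"chunk {i} has neither 'choices' nor 'usage'")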

* fix(main.py): fix import
Krish Dholakia 2024-10-19 16:16:51 -07:00 committed by GitHub
parent 979e8ea526
commit c58d542282
10 changed files with 638 additions and 282 deletions


@@ -6,6 +6,17 @@ import traceback
import pytest
from typing import List
from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse


def check_non_streaming_response(completion):
    assert completion.choices[0].message.audio is not None, "Audio response is missing"
    print("audio", completion.choices[0].message.audio)
    assert isinstance(
        completion.choices[0].message.audio, ChatCompletionAudioResponse
    ), "Invalid audio response type"
    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"


sys.path.insert(
    0, os.path.abspath("../..")
@@ -656,12 +667,60 @@ def test_stream_chunk_builder_openai_prompt_caching():
    response = stream_chunk_builder(chunks=chunks)
    print(f"response: {response}")
    print(f"response usage: {response.usage}")
-   for k, v in usage_obj.model_dump().items():
+   for k, v in usage_obj.model_dump(exclude_none=True).items():
        print(k, v)
        response_usage_value = getattr(response.usage, k)  # type: ignore
        print(f"response_usage_value: {response_usage_value}")
        print(f"type: {type(response_usage_value)}")
        if isinstance(response_usage_value, BaseModel):
-           assert response_usage_value.model_dump() == v
+           assert response_usage_value.model_dump(exclude_none=True) == v
        else:
            assert response_usage_value == v
def test_stream_chunk_builder_openai_audio_output_usage():
    from pydantic import BaseModel
    from openai import OpenAI
    from typing import Optional

    client = OpenAI(
        # This is the default and can be omitted
        api_key=os.getenv("OPENAI_API_KEY"),
    )

    completion = client.chat.completions.create(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=True,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in completion:
        chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))

    usage_obj: Optional[litellm.Usage] = None

    for index, chunk in enumerate(chunks):
        if hasattr(chunk, "usage"):
            usage_obj = chunk.usage
            print(f"chunk usage: {chunk.usage}")
            print(f"index: {index}")
            print(f"len chunks: {len(chunks)}")

    print(f"usage_obj: {usage_obj}")
    response = stream_chunk_builder(chunks=chunks)
    print(f"response usage: {response.usage}")
    check_non_streaming_response(response)
    print(f"response: {response}")

    for k, v in usage_obj.model_dump(exclude_none=True).items():
        print(k, v)
        response_usage_value = getattr(response.usage, k)  # type: ignore
        print(f"response_usage_value: {response_usage_value}")
        print(f"type: {type(response_usage_value)}")
        if isinstance(response_usage_value, BaseModel):
            assert response_usage_value.model_dump(exclude_none=True) == v
        else:
            assert response_usage_value == v
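
If you want to inspect the reassembled audio from a run like this, the base64 `data` field decodes to raw PCM16 bytes; a small follow-up sketch (the output file name is arbitrary):

import base64

# Decode the stitched-together base64 audio into raw PCM16 bytes.
pcm_bytes = base64.b64decode(response.choices[0].message.audio.data)
with open("reassembled_audio.pcm", "wb") as f:  # illustrative output path
    f.write(pcm_bytes)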