(feat) Support audio param in responses streaming (#6312)

* add audio, modalities param

* add test for gpt audio models

* add get_supported_openai_params for GPT audio models

* add supported params for audio

* test_audio_output_from_model

* bump openai to openai==1.52.0

* bump openai on pyproject

* fix audio test

* fix test mock_chat_response

* handle audio for Message

* fix handling audio for OAI compatible API endpoints

* fix linting

* fix mock dbrx test

* add audio to Delta

* handle model_response.choices.delta.audio

* fix linting
Ishaan Jaff authored on 2024-10-18 19:16:14 +05:30 (committed by GitHub)
parent 13e0b3f626
commit a0d45ba516
4 changed files with 81 additions and 20 deletions
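
For orientation, here is a minimal sketch of the call shape this commit enables, mirroring the test diff below: the new modalities and audio parameters are passed through litellm.acompletion, and streamed audio arrives as a dict on choices[0].delta.audio. The helper name and the chunk-accumulation logic are illustrative additions, not part of the commit; the model name, parameters, and delta.audio shape are taken from the tests.

    import base64

    import litellm


    async def stream_audio_answer() -> bytes:
        # New in this commit: modalities and audio are forwarded for
        # gpt-4o-audio-preview, and streamed audio shows up on delta.audio.
        completion = await litellm.acompletion(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": "alloy", "format": "pcm16"},  # the tests use pcm16 when streaming
            messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
            stream=True,
        )

        audio_b64_chunks = []
        async for chunk in completion:
            delta = chunk.choices[0].delta
            # delta.audio is a dict; "data", "transcript", and "id" may each be
            # absent on any given chunk, so guard every access.
            if delta.audio is not None and delta.audio.get("data") is not None:
                audio_b64_chunks.append(delta.audio["data"])

        # The concatenated chunks are base64-encoded pcm16 audio.
        return b"".join(base64.b64decode(c) for c in audio_b64_chunks)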

@@ -15,40 +15,74 @@ from respx import MockRouter
 import litellm
 from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse
 import base64
 import requests
+
+
+def check_non_streaming_response(completion):
+    assert completion.choices[0].message.audio is not None, "Audio response is missing"
+    assert isinstance(
+        completion.choices[0].message.audio, ChatCompletionAudioResponse
+    ), "Invalid audio response type"
+    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"
+
+
+async def check_streaming_response(completion):
+    _audio_bytes = None
+    _audio_transcript = None
+    _audio_id = None
+    async for chunk in completion:
+        print(chunk)
+        _choice: StreamingChoices = chunk.choices[0]
+        if _choice.delta.audio is not None:
+            if _choice.delta.audio.get("data") is not None:
+                _audio_bytes = _choice.delta.audio["data"]
+            if _choice.delta.audio.get("transcript") is not None:
+                _audio_transcript = _choice.delta.audio["transcript"]
+            if _choice.delta.audio.get("id") is not None:
+                _audio_id = _choice.delta.audio["id"]
+    # At least one chunk should have set _audio_bytes, _audio_transcript, _audio_id
+    assert _audio_bytes is not None
+    assert _audio_transcript is not None
+    assert _audio_id is not None
+
+
 @pytest.mark.asyncio
-@pytest.mark.flaky(retries=3, delay=1)
-async def test_audio_output_from_model():
-    litellm.set_verbose = True
+# @pytest.mark.flaky(retries=3, delay=1)
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_output_from_model(stream):
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
+    litellm.set_verbose = False
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": "pcm16"},
         messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=stream,
     )
-    print("response= ", completion)
-    print(completion.choices[0])
-    assert completion.choices[0].message.audio is not None
-    assert isinstance(
-        completion.choices[0].message.audio,
-        litellm.types.utils.ChatCompletionAudioResponse,
-    )
-    assert len(completion.choices[0].message.audio.data) > 0
-    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
-    with open("dog.wav", "wb") as f:
-        f.write(wav_bytes)
+
+    if stream is True:
+        await check_streaming_response(completion)
+    else:
+        print("response= ", completion)
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)


 @pytest.mark.asyncio
-async def test_audio_input_to_model():
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_input_to_model(stream):
     # Fetch the audio file and convert it to a base64 encoded string
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
     litellm.set_verbose = True
     url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
     response = requests.get(url)
     response.raise_for_status()
@@ -58,7 +92,8 @@ async def test_audio_input_to_model():
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": audio_format},
+        stream=stream,
         messages=[
             {
                 "role": "user",
@@ -73,4 +108,12 @@ async def test_audio_input_to_model():
         ],
     )
     print(completion.choices[0].message)
+    if stream is True:
+        await check_streaming_response(completion)
+    else:
+        print("response= ", completion)
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)
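
The streaming tests only assert that audio chunks arrive; if you wanted to save streamed pcm16 output to a playable file (the way the non-streaming branch writes dog.wav), a sketch using the standard-library wave module follows. The helper name is hypothetical, and the 24 kHz / mono / 16-bit parameters are an assumption about the pcm16 stream, not something this diff specifies.

    import wave


    def write_pcm16_to_wav(pcm_bytes: bytes, path: str = "dog.wav") -> None:
        # Assumed stream characteristics: 16-bit signed PCM, mono, 24 kHz.
        # Adjust these if the provider documents different values.
        with wave.open(path, "wb") as wav_file:
            wav_file.setnchannels(1)       # mono
            wav_file.setsampwidth(2)       # 2 bytes per sample == 16-bit
            wav_file.setframerate(24000)   # assumed sample rate
            wav_file.writeframes(pcm_bytes)

Fed the base64-decoded bytes accumulated from delta.audio["data"], this produces a playable wav without relying on the non-streaming "wav" format path.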