diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index fce45c336..1057ff2a5 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -451,12 +451,18 @@ class Delta(OpenAIObject):
         role=None,
         function_call=None,
         tool_calls=None,
+        audio: Optional[ChatCompletionAudioResponse] = None,
         **params,
     ):
         super(Delta, self).__init__(**params)
         self.content = content
         self.role = role
 
+        # Set default values and correct types
+        self.function_call: Optional[Union[FunctionCall, Any]] = None
+        self.tool_calls: Optional[List[Union[ChatCompletionDeltaToolCall, Any]]] = None
+        self.audio: Optional[ChatCompletionAudioResponse] = None
+
         if function_call is not None and isinstance(function_call, dict):
             self.function_call = FunctionCall(**function_call)
         else:
@@ -473,6 +479,8 @@ class Delta(OpenAIObject):
         else:
             self.tool_calls = tool_calls
 
+        self.audio = audio
+
     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
         return hasattr(self, key)
diff --git a/litellm/utils.py b/litellm/utils.py
index de7d528de..1b4864e39 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -7639,6 +7639,10 @@ class CustomStreamWrapper:
                         )
                     )
                     model_response.choices[0].delta = Delta()
+                elif (
+                    delta is not None and getattr(delta, "audio", None) is not None
+                ):
+                    model_response.choices[0].delta.audio = delta.audio
                 else:
                     try:
                         delta = (
@@ -7805,6 +7809,12 @@ class CustomStreamWrapper:
                     model_response.choices[0].delta["role"] = "assistant"
                     self.sent_first_chunk = True
                 return model_response
+            elif (
+                len(model_response.choices) > 0
+                and hasattr(model_response.choices[0].delta, "audio")
+                and model_response.choices[0].delta.audio is not None
+            ):
+                return model_response
             else:
                 if hasattr(model_response, "usage"):
                     self.chunks.append(model_response)
diff --git a/tests/llm_translation/dog.wav b/tests/llm_translation/dog.wav
index 4baa24f3d..3ca0b533b 100644
Binary files a/tests/llm_translation/dog.wav and b/tests/llm_translation/dog.wav differ
diff --git a/tests/llm_translation/test_gpt4o_audio.py b/tests/llm_translation/test_gpt4o_audio.py
index 16128c2fe..2eae06a44 100644
--- a/tests/llm_translation/test_gpt4o_audio.py
+++ b/tests/llm_translation/test_gpt4o_audio.py
@@ -15,40 +15,74 @@ from respx import MockRouter
 
 import litellm
 from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse
 import base64
 import requests
 
 
+def check_non_streaming_response(completion):
+    assert completion.choices[0].message.audio is not None, "Audio response is missing"
+    assert isinstance(
+        completion.choices[0].message.audio, ChatCompletionAudioResponse
+    ), "Invalid audio response type"
+    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"
+
+
+async def check_streaming_response(completion):
+    _audio_bytes = None
+    _audio_transcript = None
+    _audio_id = None
+    async for chunk in completion:
+        print(chunk)
+        _choice: StreamingChoices = chunk.choices[0]
+        if _choice.delta.audio is not None:
+            if _choice.delta.audio.get("data") is not None:
+                _audio_bytes = _choice.delta.audio["data"]
+            if _choice.delta.audio.get("transcript") is not None:
+                _audio_transcript = _choice.delta.audio["transcript"]
+            if _choice.delta.audio.get("id") is not None:
+                _audio_id = _choice.delta.audio["id"]
+    # At least one chunk should have set _audio_bytes, _audio_transcript, _audio_id
+    assert _audio_bytes is not None
+    assert _audio_transcript is not None
+    assert _audio_id is not None
+
+
 @pytest.mark.asyncio
-@pytest.mark.flaky(retries=3, delay=1)
-async def test_audio_output_from_model():
-    litellm.set_verbose = True
+# @pytest.mark.flaky(retries=3, delay=1)
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_output_from_model(stream):
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
+    litellm.set_verbose = False
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": audio_format},
         messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=stream,
     )
 
-    print("response= ", completion)
+    if stream is True:
+        await check_streaming_response(completion)
 
-    print(completion.choices[0])
-
-    assert completion.choices[0].message.audio is not None
-    assert isinstance(
-        completion.choices[0].message.audio,
-        litellm.types.utils.ChatCompletionAudioResponse,
-    )
-    assert len(completion.choices[0].message.audio.data) > 0
-
-    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
-    with open("dog.wav", "wb") as f:
-        f.write(wav_bytes)
+    else:
+        print("response= ", completion)
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)
 
 
 @pytest.mark.asyncio
-async def test_audio_input_to_model():
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_input_to_model(stream):
     # Fetch the audio file and convert it to a base64 encoded string
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
+    litellm.set_verbose = True
     url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
     response = requests.get(url)
    response.raise_for_status()
@@ -58,7 +92,8 @@ async def test_audio_input_to_model():
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": audio_format},
+        stream=stream,
         messages=[
             {
                 "role": "user",
@@ -73,4 +108,12 @@ async def test_audio_input_to_model():
         ],
     )
 
-    print(completion.choices[0].message)
+    if stream is True:
+        await check_streaming_response(completion)
+    else:
+        print("response= ", completion)
+
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)
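For reviewers, a minimal sketch of how a caller might consume the streaming audio deltas this patch forwards. It follows the shape exercised by `check_streaming_response` above (`delta.audio` as a dict with optional `data`/`transcript`/`id` keys). The helper name and output path are hypothetical, and the 24 kHz / mono / 16-bit WAV parameters are an assumption based on OpenAI's documented `pcm16` output format, not something this patch enforces:

```python
import asyncio
import base64
import wave

import litellm


async def save_streamed_audio(path: str = "reply.wav") -> None:
    # Request streaming audio; OpenAI only allows pcm16 when stream=True,
    # which is why the tests above switch formats on the stream flag.
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=True,
    )

    pcm = bytearray()
    transcript = ""
    async for chunk in completion:
        delta = chunk.choices[0].delta
        # After this patch, audio deltas survive CustomStreamWrapper as dicts
        # with optional "data" / "transcript" / "id" keys (see the tests above).
        audio = getattr(delta, "audio", None)
        if audio:
            if audio.get("data"):
                pcm += base64.b64decode(audio["data"])
            if audio.get("transcript"):
                transcript += audio["transcript"]

    # pcm16 frames carry no header; wrap them in a WAV container.
    # Assumption: 24 kHz, mono, 16-bit signed PCM, per OpenAI's pcm16 docs.
    with wave.open(path, "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(24000)
        f.writeframes(bytes(pcm))
    print("transcript:", transcript)


if __name__ == "__main__":
    asyncio.run(save_streamed_audio())
```

The wrapping step is the reason the non-streaming tests can write `message.audio.data` straight to disk (the model returns a complete `wav` container there) while streamed `pcm16` chunks cannot be played back until a header is added.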