diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index fce45c336..1057ff2a5 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -451,12 +451,18 @@ class Delta(OpenAIObject):
         role=None,
         function_call=None,
         tool_calls=None,
+        audio: Optional[ChatCompletionAudioResponse] = None,
         **params,
     ):
         super(Delta, self).__init__(**params)
         self.content = content
         self.role = role
 
+        # Set default values and correct types
+        self.function_call: Optional[Union[FunctionCall, Any]] = None
+        self.tool_calls: Optional[List[Union[ChatCompletionDeltaToolCall, Any]]] = None
+        self.audio: Optional[ChatCompletionAudioResponse] = None
+
         if function_call is not None and isinstance(function_call, dict):
             self.function_call = FunctionCall(**function_call)
         else:
@@ -473,6 +479,8 @@ class Delta(OpenAIObject):
         else:
             self.tool_calls = tool_calls
 
+        self.audio = audio
+
     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
         return hasattr(self, key)
diff --git a/litellm/utils.py b/litellm/utils.py
index de7d528de..1b4864e39 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -7639,6 +7639,10 @@ class CustomStreamWrapper:
                         )
                     )
                     model_response.choices[0].delta = Delta()
+                elif (
+                    delta is not None and getattr(delta, "audio", None) is not None
+                ):
+                    model_response.choices[0].delta.audio = delta.audio
                 else:
                     try:
                         delta = (
@@ -7805,6 +7809,12 @@ class CustomStreamWrapper:
                     model_response.choices[0].delta["role"] = "assistant"
                     self.sent_first_chunk = True
                 return model_response
+            elif (
+                len(model_response.choices) > 0
+                and hasattr(model_response.choices[0].delta, "audio")
+                and model_response.choices[0].delta.audio is not None
+            ):
+                return model_response
             else:
                 if hasattr(model_response, "usage"):
                     self.chunks.append(model_response)
diff --git a/tests/llm_translation/dog.wav b/tests/llm_translation/dog.wav
index 4baa24f3d..3ca0b533b 100644
Binary files a/tests/llm_translation/dog.wav and b/tests/llm_translation/dog.wav differ
diff --git a/tests/llm_translation/test_gpt4o_audio.py b/tests/llm_translation/test_gpt4o_audio.py
index 16128c2fe..2eae06a44 100644
--- a/tests/llm_translation/test_gpt4o_audio.py
+++ b/tests/llm_translation/test_gpt4o_audio.py
@@ -15,40 +15,74 @@ from respx import MockRouter
 
 import litellm
 from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse
 import base64
 import requests
 
 
+def check_non_streaming_response(completion):
+    assert completion.choices[0].message.audio is not None, "Audio response is missing"
+    assert isinstance(
+        completion.choices[0].message.audio, ChatCompletionAudioResponse
+    ), "Invalid audio response type"
+    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"
+
+
+async def check_streaming_response(completion):
+    _audio_bytes = None
+    _audio_transcript = None
+    _audio_id = None
+    async for chunk in completion:
+        print(chunk)
+        _choice: StreamingChoices = chunk.choices[0]
+        if _choice.delta.audio is not None:
+            if _choice.delta.audio.get("data") is not None:
+                _audio_bytes = _choice.delta.audio["data"]
+            if _choice.delta.audio.get("transcript") is not None:
+                _audio_transcript = _choice.delta.audio["transcript"]
+            if _choice.delta.audio.get("id") is not None:
+                _audio_id = _choice.delta.audio["id"]
+    # At least one chunk should have set _audio_bytes, _audio_transcript, _audio_id
+    assert _audio_bytes is not None
+    assert _audio_transcript is not None
+    assert _audio_id is not None
+
+
 @pytest.mark.asyncio
-@pytest.mark.flaky(retries=3, delay=1)
-async def test_audio_output_from_model():
-    litellm.set_verbose = True
+# @pytest.mark.flaky(retries=3, delay=1)
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_output_from_model(stream):
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
+    litellm.set_verbose = False
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": audio_format},
         messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=stream,
     )
 
-    print("response= ", completion)
+    if stream is True:
+        await check_streaming_response(completion)
 
-    print(completion.choices[0])
-
-    assert completion.choices[0].message.audio is not None
-    assert isinstance(
-        completion.choices[0].message.audio,
-        litellm.types.utils.ChatCompletionAudioResponse,
-    )
-    assert len(completion.choices[0].message.audio.data) > 0
-
-    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
-    with open("dog.wav", "wb") as f:
-        f.write(wav_bytes)
+    else:
+        print("response= ", completion)
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)
 
 
 @pytest.mark.asyncio
-async def test_audio_input_to_model():
+@pytest.mark.parametrize("stream", [True, False])
+async def test_audio_input_to_model(stream):
     # Fetch the audio file and convert it to a base64 encoded string
+    audio_format = "pcm16"
+    if stream is False:
+        audio_format = "wav"
+    litellm.set_verbose = True
     url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
     response = requests.get(url)
    response.raise_for_status()
@@ -58,7 +92,8 @@ async def test_audio_input_to_model():
     completion = await litellm.acompletion(
         model="gpt-4o-audio-preview",
         modalities=["text", "audio"],
-        audio={"voice": "alloy", "format": "wav"},
+        audio={"voice": "alloy", "format": audio_format},
+        stream=stream,
         messages=[
             {
                 "role": "user",
@@ -73,4 +108,12 @@ async def test_audio_input_to_model():
         ],
     )
 
-    print(completion.choices[0].message)
+    if stream is True:
+        await check_streaming_response(completion)
+    else:
+        print("response= ", completion)
+
+        check_non_streaming_response(completion)
+        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+        with open("dog.wav", "wb") as f:
+            f.write(wav_bytes)
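For reviewers, a minimal sketch of how a caller might consume the streaming audio deltas this patch forwards. It follows the shape exercised by `check_streaming_response` above (`delta.audio` as a dict with optional `data`/`transcript`/`id` keys). The helper name and output path are hypothetical, and the 24 kHz / mono / 16-bit WAV parameters are an assumption based on OpenAI's documented `pcm16` output format, not something this patch enforces:

```python
import asyncio
import base64
import wave

import litellm


async def save_streamed_audio(path: str = "reply.wav") -> None:
    # Request streaming audio; OpenAI only allows pcm16 when stream=True,
    # which is why the tests above switch formats on the stream flag.
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=True,
    )

    pcm = bytearray()
    transcript = ""
    async for chunk in completion:
        delta = chunk.choices[0].delta
        # After this patch, audio deltas survive CustomStreamWrapper as dicts
        # with optional "data" / "transcript" / "id" keys (see the tests above).
        audio = getattr(delta, "audio", None)
        if audio:
            if audio.get("data"):
                pcm += base64.b64decode(audio["data"])
            if audio.get("transcript"):
                transcript += audio["transcript"]

    # pcm16 frames carry no header; wrap them in a WAV container.
    # Assumption: 24 kHz, mono, 16-bit signed PCM, per OpenAI's pcm16 docs.
    with wave.open(path, "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(24000)
        f.writeframes(bytes(pcm))
    print("transcript:", transcript)


if __name__ == "__main__":
    asyncio.run(save_streamed_audio())
```

The wrapping step is the reason the non-streaming tests can write `message.audio.data` straight to disk (the model returns a complete `wav` container there) while streamed `pcm16` chunks cannot be played back until a header is added.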