(feat) Support audio, modalities params (#6304)

* add audio, modalities params

* add test for gpt audio models

* add get_supported_openai_params for GPT audio models

* add supported params for audio

* test_audio_output_from_model

* bump openai to openai==1.52.0

* bump openai on pyproject

* fix audio test

* fix test mock_chat_response

* handle audio for Message

* fix handling of audio for OpenAI-compatible API endpoints

* fix linting

* fix mock dbrx test
Ishaan Jaff 2024-10-18 19:14:25 +05:30 committed by GitHub
parent e35fc3203e
commit 13e0b3f626
15 changed files with 290 additions and 23 deletions
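
For context, a minimal sketch (not part of the diff) of how the new parameters can be exercised synchronously, mirroring the async test added below. It assumes an OPENAI_API_KEY in the environment and that litellm.get_supported_openai_params (the helper the commit message refers to) accepts a bare model name:

import litellm

# The commit adds "audio" and "modalities" to the supported OpenAI params for
# GPT audio models, so the helper should now report them for this model.
supported = litellm.get_supported_openai_params(model="gpt-4o-audio-preview") or []
print("audio" in supported, "modalities" in supported)

# Synchronous counterpart of the acompletion call in the test file below.
resp = litellm.completion(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
)
print(resp.choices[0].message.audio)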

@@ -0,0 +1,76 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import httpx
import pytest
from respx import MockRouter

import litellm
from litellm import Choices, Message, ModelResponse
import base64
import requests


@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_audio_output_from_model():
    # Ask the audio-capable model for a spoken reply and verify litellm
    # surfaces it as a typed ChatCompletionAudioResponse on the message.
    litellm.set_verbose = True
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
    )

    print("response= ", completion)
    print(completion.choices[0])

    assert completion.choices[0].message.audio is not None
    assert isinstance(
        completion.choices[0].message.audio,
        litellm.types.utils.ChatCompletionAudioResponse,
    )
    assert len(completion.choices[0].message.audio.data) > 0

    # The audio payload is base64-encoded WAV bytes; decode and write it out.
    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
    with open("dog.wav", "wb") as f:
        f.write(wav_bytes)


@pytest.mark.asyncio
async def test_audio_input_to_model():
    # Fetch the audio file and convert it to a base64 encoded string
    url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
    response = requests.get(url)
    response.raise_for_status()
    wav_data = response.content
    encoded_string = base64.b64encode(wav_data).decode("utf-8")

    # Send the recording to the model as an `input_audio` content block.
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this recording?"},
                    {
                        "type": "input_audio",
                        "input_audio": {"data": encoded_string, "format": "wav"},
                    },
                ],
            },
        ],
    )

    print(completion.choices[0].message)
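
Beyond the diff, a hedged sketch of how the returned audio object might be consumed outside a test, assuming litellm's ChatCompletionAudioResponse mirrors OpenAI's chat-completion audio object. The test above relies only on the data field; the transcript field and the save_audio_reply helper are assumptions for illustration:

import base64

def save_audio_reply(message, path="reply.wav"):
    # `message` is a litellm Message carrying a ChatCompletionAudioResponse.
    audio = message.audio
    if audio is None:
        return None
    # `data` is base64-encoded audio, as the test above demonstrates.
    with open(path, "wb") as f:
        f.write(base64.b64decode(audio.data))
    # `transcript` (text rendering of the spoken reply) is assumed to exist,
    # mirroring OpenAI's audio response object.
    return getattr(audio, "transcript", None)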