litellm-mirror/tests/llm_translation/test_gpt4o_audio.py
Krish Dholakia 5c107c64dd
Add gemini audio input support + handle special tokens in sagemaker response (#9640)
* fix(internal_user_endpoints.py): cleanup unused variables on beta endpoint

no team/org split on daily user endpoint

* build(model_prices_and_context_window.json): gemini-2.0-flash supports audio input

* feat(gemini/transformation.py): support passing audio input to gemini

* test: fix test

* fix(gemini/transformation.py): support audio input as a url

enables passing google cloud bucket urls

* fix(gemini/transformation.py): support explicitly passing format of file

* fix(gemini/transformation.py): expand support for inferred file types from url

* fix(sagemaker/completion/transformation.py): fix special token error when counting sagemaker tokens

* test: fix import
2025-03-29 19:23:09 -07:00

134 lines
4.6 KiB
Python

import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import httpx
import pytest
from respx import MockRouter
import litellm
from litellm import Choices, Message, ModelResponse
from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse
import base64
import requests
def check_non_streaming_response(completion):
    """Assert a non-streaming completion carries a valid, non-empty audio payload."""
    audio = completion.choices[0].message.audio
    assert audio is not None, "Audio response is missing"
    assert isinstance(audio, ChatCompletionAudioResponse), "Invalid audio response type"
    assert len(audio.data) > 0, "Audio data is empty"
async def check_streaming_response(completion):
    """Consume a streamed completion and assert that audio data, transcript,
    and id each appeared in at least one chunk's delta."""
    collected = {"data": None, "transcript": None, "id": None}
    async for chunk in completion:
        print(chunk)
        delta = chunk.choices[0].delta
        if delta.audio is not None:
            for field in ("data", "transcript", "id"):
                value = delta.audio.get(field)
                if value is not None:
                    collected[field] = value
    # At least one chunk should have set each of data, transcript, and id
    assert collected["data"] is not None
    assert collected["transcript"] is not None
    assert collected["id"] is not None
@pytest.mark.asyncio
# @pytest.mark.flaky(retries=3, delay=1)
@pytest.mark.parametrize("stream", [True, False])
async def test_audio_output_from_model(stream):
    """Verify gpt-4o-audio-preview returns audio output, streaming and non-streaming.

    Streaming requests must use "pcm16" (the only format supported for
    streamed audio); non-streaming requests ask for "wav".
    """
    audio_format = "pcm16"
    if stream is False:
        audio_format = "wav"
    litellm.set_verbose = False
    try:
        completion = await litellm.acompletion(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            # BUG FIX: previously a hard-coded "pcm16" was sent, leaving the
            # computed audio_format unused; pass the variable so non-streaming
            # requests use "wav" as intended (matches test_audio_input_to_model).
            audio={"voice": "alloy", "format": audio_format},
            messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
            stream=stream,
        )
    except litellm.Timeout as e:
        print(e)
        pytest.skip("Skipping test due to timeout")
    except Exception as e:
        if "openai-internal" in str(e):
            pytest.skip("Skipping test due to openai-internal error")
        # BUG FIX: re-raise unexpected errors. Previously they were silently
        # swallowed and the test then crashed with UnboundLocalError on
        # `completion`, masking the real failure (sibling test already raises).
        raise e

    if stream is True:
        await check_streaming_response(completion)
    else:
        print("response= ", completion)
        check_non_streaming_response(completion)
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with open("dog.wav", "wb") as f:
            f.write(wav_bytes)
@pytest.mark.asyncio
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("model", ["gpt-4o-audio-preview"])  # "gpt-4o-audio-preview",
async def test_audio_input_to_model(stream, model):
    """Send a wav recording as input audio and check the model's audio reply."""
    # Streaming audio replies must be pcm16; non-streaming may request wav.
    audio_format = "wav" if stream is False else "pcm16"
    litellm._turn_on_debug()
    litellm.drop_params = True

    # Fetch a sample recording and base64-encode it for the input_audio payload.
    url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
    audio_http_response = requests.get(url)
    audio_http_response.raise_for_status()
    encoded_string = base64.b64encode(audio_http_response.content).decode("utf-8")

    try:
        completion = await litellm.acompletion(
            model=model,
            modalities=["text", "audio"],
            audio={"voice": "alloy", "format": audio_format},
            stream=stream,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What is in this recording?"},
                        {
                            "type": "input_audio",
                            "input_audio": {"data": encoded_string, "format": "wav"},
                        },
                    ],
                },
            ],
        )
    except litellm.Timeout as e:
        print(e)
        pytest.skip("Skipping test due to timeout")
    except Exception as e:
        if "openai-internal" in str(e):
            pytest.skip("Skipping test due to openai-internal error")
        raise e

    if stream is True:
        await check_streaming_response(completion)
    else:
        print("response= ", completion)
        check_non_streaming_response(completion)
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with open("dog.wav", "wb") as f:
            f.write(wav_bytes)