(feat) Support audio, modalities params (#6304)

* add audio, modalities param

* add test for gpt audio models

* add get_supported_openai_params for GPT audio models

* add supported params for audio

* test_audio_output_from_model

* bump openai to openai==1.52.0

* bump openai on pyproject

* fix audio test

* fix test mock_chat_response

* handle audio for Message

* fix handling audio for OAI compatible API endpoints

* fix linting

* fix mock dbrx test
Ishaan Jaff 2024-10-18 19:14:25 +05:30 committed by GitHub
parent e35fc3203e
commit 13e0b3f626
15 changed files with 290 additions and 23 deletions
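For context, this is what the change enables end to end, sketched from the new test added in this commit (test_audio_output_from_model, below); it assumes openai>=1.52.0 and a valid OPENAI_API_KEY:

import base64

import litellm

# Ask an audio-capable model for both text and audio output.
completion = litellm.completion(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
)

# The assistant message now carries base64-encoded audio plus a transcript.
audio = completion.choices[0].message.audio
with open("answer.wav", "wb") as f:
    f.write(base64.b64decode(audio.data))
print(audio.transcript)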


@@ -47,7 +47,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.51.0
+pip install openai==1.52.0
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@@ -517,7 +517,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@@ -634,7 +634,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@@ -726,7 +726,7 @@ jobs:
pip install "pytest-asyncio==0.21.1"
pip install "google-cloud-aiplatform==1.43.0"
pip install aiohttp
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"
@@ -921,7 +921,7 @@ jobs:
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"


@@ -1,5 +1,5 @@
# used by CI/CD testing
-openai==1.51.0
+openai==1.52.0
python-dotenv
tiktoken
importlib_metadata


@@ -987,10 +987,19 @@ from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.OpenAI.chat.o1_transformation import (
    OpenAIO1Config,
)
+openAIO1Config = OpenAIO1Config()
from .llms.OpenAI.chat.gpt_transformation import (
    OpenAIGPTConfig,
)
+openAIGPTConfig = OpenAIGPTConfig()
+from .llms.OpenAI.chat.gpt_audio_transformation import (
+    OpenAIGPTAudioConfig,
+)
+openAIGPTAudioConfig = OpenAIGPTAudioConfig()
from .llms.nvidia_nim.chat import NvidiaNimConfig
from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
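The three module-level instances above (openAIO1Config, openAIGPTConfig, openAIGPTAudioConfig) replace the per-call OpenAIO1Config()/OpenAIGPTConfig() constructions used further down in this diff; the config objects appear to hold no per-request state, so building them once at import time and reusing them is safe. A sketch of the resulting access pattern:

import litellm

# Singletons are constructed once at import time and shared everywhere.
params = litellm.openAIGPTAudioConfig.get_supported_openai_params(
    model="gpt-4o-audio-preview"
)
print("audio" in params)  # expected: True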


@@ -0,0 +1,66 @@
"""
Support for GPT-4o audio Family
OpenAI Doc: https://platform.openai.com/docs/guides/audio/quickstart?audio-generation-quickstart-example=audio-in&lang=python
"""
import types
from typing import Optional, Union
import litellm
from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage
from .gpt_transformation import OpenAIGPTConfig
class OpenAIGPTAudioConfig(OpenAIGPTConfig):
"""
Reference: https://platform.openai.com/docs/guides/audio
"""
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self, model: str) -> list:
"""
Get the supported OpenAI params for the `gpt-audio` models
"""
all_openai_params = super().get_supported_openai_params(model=model)
audio_specific_params = ["audio"]
return all_openai_params + audio_specific_params
def is_model_gpt_audio_model(self, model: str) -> bool:
if model in litellm.open_ai_chat_completion_models and "audio" in model:
return True
return False
def _map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
return super()._map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
model=model,
drop_params=drop_params,
)
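A quick sketch of how this new config behaves, assuming "gpt-4o-audio-preview" is registered in litellm.open_ai_chat_completion_models (which is exactly what is_model_gpt_audio_model checks):

from litellm.llms.OpenAI.chat.gpt_audio_transformation import OpenAIGPTAudioConfig

config = OpenAIGPTAudioConfig()

# Detection requires a known OpenAI chat model whose name contains "audio".
print(config.is_model_gpt_audio_model("gpt-4o-audio-preview"))  # expected: True
print(config.is_model_gpt_audio_model("gpt-4o"))                # False

# The supported params are the base GPT list plus "audio".
params = config.get_supported_openai_params(model="gpt-4o-audio-preview")
print("audio" in params and "modalities" in params)  # expected: True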


@@ -93,6 +93,7 @@ class OpenAIGPTConfig:
            "top_logprobs",
            "max_tokens",
            "max_completion_tokens",
+            "modalities",
            "n",
            "presence_penalty",
            "seed",
@@ -131,6 +132,17 @@ class OpenAIGPTConfig:
        model: str,
        drop_params: bool,
    ) -> dict:
+        """
+        If any supported_openai_params are in non_default_params, add them to optional_params, so they are used in the API call
+
+        Args:
+            non_default_params (dict): Non-default parameters to filter.
+            optional_params (dict): Optional parameters to update.
+            model (str): Model name for parameter support check.
+
+        Returns:
+            dict: Updated optional_params with supported non-default parameters.
+        """
        supported_openai_params = self.get_supported_openai_params(model)
        for param, value in non_default_params.items():
            if param in supported_openai_params:
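In other words, _map_openai_params is a whitelist copy: only keys present in get_supported_openai_params(model) make it into optional_params. A minimal sketch of that behavior (the unsupported key name is made up for illustration):

from litellm.llms.OpenAI.chat.gpt_transformation import OpenAIGPTConfig

config = OpenAIGPTConfig()
optional_params = config._map_openai_params(
    non_default_params={"modalities": ["text", "audio"], "made_up_param": 1},
    optional_params={},
    model="gpt-4o",
    drop_params=True,
)
# "modalities" is copied over; the made-up key is simply not carried along.
print(optional_params)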


@@ -303,10 +303,25 @@ class OpenAIConfig:
        }

    def get_supported_openai_params(self, model: str) -> list:
-        if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
-            return litellm.OpenAIO1Config().get_supported_openai_params(model=model)
+        """
+        This function returns the list of supported openai parameters for a given OpenAI Model
+
+        - If O1 model, returns O1 supported params
+        - If gpt-audio model, returns gpt-audio supported params
+        - Else, returns gpt supported params
+
+        Args:
+            model (str): OpenAI model
+
+        Returns:
+            list: List of supported openai parameters
+        """
+        if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
+            return litellm.openAIO1Config.get_supported_openai_params(model=model)
+        elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model):
+            return litellm.openAIGPTAudioConfig.get_supported_openai_params(model=model)
        else:
-            return litellm.OpenAIGPTConfig().get_supported_openai_params(model=model)
+            return litellm.openAIGPTConfig.get_supported_openai_params(model=model)

    def _map_openai_params(
        self, non_default_params: dict, optional_params: dict, model: str
@@ -325,14 +340,22 @@ class OpenAIConfig:
        drop_params: bool,
    ) -> dict:
        """ """
-        if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
-            return litellm.OpenAIO1Config().map_openai_params(
+        if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
+            return litellm.openAIO1Config.map_openai_params(
                non_default_params=non_default_params,
                optional_params=optional_params,
                model=model,
                drop_params=drop_params,
            )
-        return litellm.OpenAIGPTConfig().map_openai_params(
+        elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model):
+            return litellm.openAIGPTAudioConfig.map_openai_params(
+                non_default_params=non_default_params,
+                optional_params=optional_params,
+                model=model,
+                drop_params=drop_params,
+            )
+        return litellm.openAIGPTConfig.map_openai_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
            model=model,
@@ -666,10 +689,10 @@ class OpenAIChatCompletion(BaseLLM):
                custom_llm_provider=custom_llm_provider,
            )
            if (
-                litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model)
+                litellm.openAIO1Config.is_model_o1_reasoning_model(model=model)
                and messages is not None
            ):
-                messages = litellm.OpenAIO1Config().o1_prompt_factory(
+                messages = litellm.openAIO1Config.o1_prompt_factory(
                    messages=messages,
                )
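The net effect of the dispatch above, sketched (model names are examples; the exact lists depend on the installed litellm version):

import litellm

config = litellm.OpenAIConfig()

# o1 models get the restricted o1 param list, audio models additionally
# gain "audio", and everything else falls through to the base GPT list.
print(config.get_supported_openai_params("o1-preview"))
print(config.get_supported_openai_params("gpt-4o-audio-preview"))
print(config.get_supported_openai_params("gpt-4o"))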


@@ -147,6 +147,8 @@ from .llms.vertex_ai_and_google_ai_studio.vertex_embeddings.embedding_handler im
from .llms.watsonx import IBMWatsonXAI
from .types.llms.openai import (
    ChatCompletionAssistantMessage,
+    ChatCompletionAudioParam,
+    ChatCompletionModality,
    ChatCompletionUserMessage,
    HttpxBinaryResponseContent,
)
@@ -287,6 +289,8 @@ async def acompletion(
    stop=None,
    max_tokens: Optional[int] = None,
    max_completion_tokens: Optional[int] = None,
+    modalities: Optional[List[ChatCompletionModality]] = None,
+    audio: Optional[ChatCompletionAudioParam] = None,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[dict] = None,
@@ -327,6 +331,8 @@ async def acompletion(
        stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
        max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
+        modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request. You can use `["text", "audio"]`
+        audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
        presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
        frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
        logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -366,6 +372,8 @@ async def acompletion(
        "stop": stop,
        "max_tokens": max_tokens,
        "max_completion_tokens": max_completion_tokens,
+        "modalities": modalities,
+        "audio": audio,
        "presence_penalty": presence_penalty,
        "frequency_penalty": frequency_penalty,
        "logit_bias": logit_bias,
@@ -670,6 +678,8 @@ def completion( # type: ignore # noqa: PLR0915
    stop=None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
+    modalities: Optional[List[ChatCompletionModality]] = None,
+    audio: Optional[ChatCompletionAudioParam] = None,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[dict] = None,
@@ -712,6 +722,8 @@ def completion( # type: ignore # noqa: PLR0915
        stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
        max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
+        modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request. You can use `["text", "audio"]`
+        audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
        presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
        frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
        logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -816,6 +828,8 @@ def completion( # type: ignore # noqa: PLR0915
        "stream_options",
        "stop",
        "max_completion_tokens",
+        "modalities",
+        "audio",
        "max_tokens",
        "presence_penalty",
        "frequency_penalty",
@@ -975,6 +989,8 @@ def completion( # type: ignore # noqa: PLR0915
            stop=stop,
            max_tokens=max_tokens,
            max_completion_tokens=max_completion_tokens,
+            modalities=modalities,
+            audio=audio,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
@@ -1515,7 +1531,7 @@ def completion( # type: ignore # noqa: PLR0915
        ## COMPLETION CALL
        try:
-            if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
+            if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
                response = openai_o1_chat_completions.completion(
                    model=model,
                    messages=messages,
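Since modalities and audio now travel through completion()/acompletion() like any other OpenAI param, litellm's usual drop_params behavior should apply to providers that do not support them. A hedged async sketch:

import asyncio

import litellm

async def main():
    resp = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[{"role": "user", "content": "Say yes or no."}],
        drop_params=True,  # strip unsupported params instead of raising elsewhere
    )
    print(resp.choices[0].message.audio.transcript)

asyncio.run(main())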


@@ -19,6 +19,8 @@ from openai.types.beta.threads.message import Message as OpenAIMessage
from openai.types.beta.threads.message_content import MessageContent
from openai.types.beta.threads.run import Run
from openai.types.chat import ChatCompletionChunk
+from openai.types.chat.chat_completion_audio_param import ChatCompletionAudioParam
+from openai.types.chat.chat_completion_modality import ChatCompletionModality
from openai.types.embedding import Embedding as OpenAIEmbedding
from pydantic import BaseModel, Field
from typing_extensions import Dict, Required, TypedDict, override
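For reference, the two re-exported types are defined upstream in the openai package roughly as follows (shape as of openai 1.52.0; the exact literals are an assumption, check your installed version):

from typing import Literal

from typing_extensions import Required, TypedDict

ChatCompletionModality = Literal["text", "audio"]

class ChatCompletionAudioParam(TypedDict, total=False):
    format: Required[Literal["wav", "mp3", "flac", "opus", "pcm16"]]
    voice: Required[Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]]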


@@ -321,6 +321,54 @@ class ChatCompletionMessageToolCall(OpenAIObject):
        setattr(self, key, value)

+class ChatCompletionAudioResponse(OpenAIObject):
+    def __init__(
+        self,
+        data: str,
+        expires_at: int,
+        transcript: str,
+        id: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionAudioResponse, self).__init__(**params)
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+        """Unique identifier for this audio response."""
+
+        self.data = data
+        """
+        Base64 encoded audio bytes generated by the model, in the format specified in
+        the request.
+        """
+
+        self.expires_at = expires_at
+        """
+        The Unix timestamp (in seconds) for when this audio response will no longer be
+        accessible on the server for use in multi-turn conversations.
+        """
+
+        self.transcript = transcript
+        """Transcript of the audio generated by the model."""
+
+    def __contains__(self, key):
+        # Define custom behavior for the 'in' operator
+        return hasattr(self, key)
+
+    def get(self, key, default=None):
+        # Custom .get() method to access attributes with a default value if the attribute doesn't exist
+        return getattr(self, key, default)
+
+    def __getitem__(self, key):
+        # Allow dictionary-style access to attributes
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        # Allow dictionary-style assignment of attributes
+        setattr(self, key, value)
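The dunder methods give the response object dict-style ergonomics on top of attribute access; a small sketch with illustrative field values:

from litellm.types.utils import ChatCompletionAudioResponse

audio = ChatCompletionAudioResponse(
    data="UklGRg==",        # truncated base64 payload, illustrative only
    expires_at=1729270000,
    transcript="yes",
)

assert audio["transcript"] == audio.transcript  # __getitem__ mirrors attributes
assert "data" in audio                          # __contains__ checks hasattr
assert audio.get("missing", "fallback") == "fallback"
print(audio.id)                                 # auto-generated uuid4 when omitted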
"""
Reference:
ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None))
@@ -328,11 +376,11 @@ ChatCompletionMessage(content='This is a test', role='assistant', function_call=
class Message(OpenAIObject):
    content: Optional[str]
    role: Literal["assistant", "user", "system", "tool", "function"]
    tool_calls: Optional[List[ChatCompletionMessageToolCall]]
    function_call: Optional[FunctionCall]
+    audio: Optional[ChatCompletionAudioResponse] = None

    def __init__(
        self,
@@ -340,9 +388,10 @@ class Message(OpenAIObject):
        role: Literal["assistant"] = "assistant",
        function_call=None,
        tool_calls: Optional[list] = None,
+        audio: Optional[ChatCompletionAudioResponse] = None,
        **params,
    ):
-        init_values = {
+        init_values: Dict[str, Any] = {
            "content": content,
            "role": role or "assistant",  # handle null input
            "function_call": (
@@ -361,11 +410,20 @@ class Message(OpenAIObject):
                else None
            ),
        }

+        if audio is not None:
+            init_values["audio"] = audio

        super(Message, self).__init__(
            **init_values,  # type: ignore
            **params,
        )

+        if audio is None:
+            # delete audio from self
+            # OpenAI compatible APIs like mistral API will raise an error if audio is passed in
+            del self.audio

    def get(self, key, default=None):
        # Custom .get() method to access attributes with a default value if the attribute doesn't exist
        return getattr(self, key, default)
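A sketch of the asymmetry this introduces: audio is attached when present, and the attribute is deleted outright when absent so serialized messages should not send an "audio" key to strict OpenAI-compatible backends (the Mistral case the comment mentions):

from litellm.types.utils import ChatCompletionAudioResponse, Message

with_audio = Message(
    content=None,
    audio=ChatCompletionAudioResponse(
        data="UklGRg==", expires_at=1729270000, transcript="yes"
    ),
)
print(with_audio.audio.transcript)  # "yes"

plain = Message(content="hi")
# Expectation after `del self.audio`: no "audio" key in the dumped payload.
print("audio" in plain.model_dump())  # expected: False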


@@ -2483,6 +2483,8 @@ def get_optional_params( # noqa: PLR0915
    stop=None,
    max_tokens=None,
    max_completion_tokens=None,
+    modalities=None,
+    audio=None,
    presence_penalty=None,
    frequency_penalty=None,
    logit_bias=None,
@@ -2562,6 +2564,8 @@ def get_optional_params( # noqa: PLR0915
        "stop": None,
        "max_tokens": None,
        "max_completion_tokens": None,
+        "modalities": None,
+        "audio": None,
        "presence_penalty": None,
        "frequency_penalty": None,
        "logit_bias": None,
@@ -5734,6 +5738,7 @@ def convert_to_model_response_object( # noqa: PLR0915
                role=choice["message"]["role"] or "assistant",
                function_call=choice["message"].get("function_call", None),
                tool_calls=tool_calls,
+                audio=choice["message"].get("audio", None),
            )
            finish_reason = choice.get("finish_reason", None)
            if finish_reason is None:
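For context, this is roughly the raw provider choice dict that convert_to_model_response_object consumes here; the one-line change forwards the optional "audio" block into Message instead of dropping it (all field values illustrative):

choice = {
    "finish_reason": "stop",
    "index": 0,
    "message": {
        "role": "assistant",
        "content": None,
        "function_call": None,
        "tool_calls": None,
        "audio": {                      # present only for audio-capable models
            "id": "audio_abc123",       # illustrative id
            "data": "<base64-encoded wav bytes>",
            "expires_at": 1729270000,
            "transcript": "yes",
        },
    },
}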

poetry.lock generated

@@ -1823,13 +1823,13 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]

[[package]]
name = "openai"
-version = "1.51.0"
+version = "1.52.0"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.7.1"
files = [
-    {file = "openai-1.51.0-py3-none-any.whl", hash = "sha256:d9affafb7e51e5a27dce78589d4964ce4d6f6d560307265933a94b2e3f3c5d2c"},
-    {file = "openai-1.51.0.tar.gz", hash = "sha256:8dc4f9d75ccdd5466fc8c99a952186eddceb9fd6ba694044773f3736a847149d"},
+    {file = "openai-1.52.0-py3-none-any.whl", hash = "sha256:0c249f20920183b0a2ca4f7dba7b0452df3ecd0fa7985eb1d91ad884bc3ced9c"},
+    {file = "openai-1.52.0.tar.gz", hash = "sha256:95c65a5f77559641ab8f3e4c3a050804f7b51d278870e2ec1f7444080bfe565a"},
]
[package.dependencies]
@@ -3519,4 +3519,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "cryptography", "fastapi", "fastapi-

[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
-content-hash = "94beed60a176d854a59b7cf9cace6f7de83ae6036cbcdb8ed10273df5b299afa"
+content-hash = "491d361cabc637f8f896091b92855040da670bb7b311dcbfe75ad20eab97400c"


@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0, !=3.9.7"
-openai = ">=1.51.0"
+openai = ">=1.52.0"
python-dotenv = ">=0.2.0"
tiktoken = ">=0.7.0"
importlib-metadata = ">=6.8.0"


@@ -1,6 +1,6 @@
# LITELLM PROXY DEPENDENCIES #
anyio==4.4.0 # openai + http req.
-openai==1.51.0 # openai req.
+openai==1.52.0 # openai req.
fastapi==0.111.0 # server dep
backoff==2.2.1 # server dep
pyyaml==6.0.0 # server dep

Binary file not shown.


@@ -0,0 +1,76 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import httpx
import pytest
from respx import MockRouter

import litellm
from litellm import Choices, Message, ModelResponse
import base64
import requests


@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_audio_output_from_model():
    litellm.set_verbose = True
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
    )

    print("response= ", completion)
    print(completion.choices[0])

    assert completion.choices[0].message.audio is not None
    assert isinstance(
        completion.choices[0].message.audio,
        litellm.types.utils.ChatCompletionAudioResponse,
    )
    assert len(completion.choices[0].message.audio.data) > 0

    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
    with open("dog.wav", "wb") as f:
        f.write(wav_bytes)


@pytest.mark.asyncio
async def test_audio_input_to_model():
    # Fetch the audio file and convert it to a base64 encoded string
    url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
    response = requests.get(url)
    response.raise_for_status()
    wav_data = response.content
    encoded_string = base64.b64encode(wav_data).decode("utf-8")

    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this recording?"},
                    {
                        "type": "input_audio",
                        "input_audio": {"data": encoded_string, "format": "wav"},
                    },
                ],
            },
        ],
    )

    print(completion.choices[0].message)