diff --git a/.circleci/config.yml b/.circleci/config.yml index 1116ad741..c84f5d941 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,7 +47,7 @@ jobs: pip install opentelemetry-api==1.25.0 pip install opentelemetry-sdk==1.25.0 pip install opentelemetry-exporter-otlp==1.25.0 - pip install openai==1.51.0 + pip install openai==1.52.0 pip install prisma==0.11.0 pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" @@ -517,7 +517,7 @@ jobs: pip install "aiodynamo==23.10.1" pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" - pip install "openai==1.51.0" + pip install "openai==1.52.0" # Run pytest and generate JUnit XML report - run: name: Build Docker image @@ -634,7 +634,7 @@ jobs: pip install "aiodynamo==23.10.1" pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" - pip install "openai==1.51.0" + pip install "openai==1.52.0" - run: name: Build Docker image command: docker build -t my-app:latest -f ./docker/Dockerfile.database . @@ -726,7 +726,7 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "google-cloud-aiplatform==1.43.0" pip install aiohttp - pip install "openai==1.51.0" + pip install "openai==1.52.0" python -m pip install --upgrade pip pip install "pydantic==2.7.1" pip install "pytest==7.3.1" @@ -921,7 +921,7 @@ jobs: pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" pip install aiohttp - pip install "openai==1.51.0" + pip install "openai==1.52.0" python -m pip install --upgrade pip pip install "pydantic==2.7.1" pip install "pytest==7.3.1" diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index b4e9fcdb3..4912c052c 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -1,5 +1,5 @@ # used by CI/CD testing -openai==1.51.0 +openai==1.52.0 python-dotenv tiktoken importlib_metadata diff --git a/litellm/__init__.py b/litellm/__init__.py index 701d7b23b..35991f421 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -987,10 +987,19 @@ from .llms.mistral.mistral_chat_transformation import MistralConfig from .llms.OpenAI.chat.o1_transformation import ( OpenAIO1Config, ) + +openAIO1Config = OpenAIO1Config() from .llms.OpenAI.chat.gpt_transformation import ( OpenAIGPTConfig, ) +openAIGPTConfig = OpenAIGPTConfig() +from .llms.OpenAI.chat.gpt_audio_transformation import ( + OpenAIGPTAudioConfig, +) + +openAIGPTAudioConfig = OpenAIGPTAudioConfig() + from .llms.nvidia_nim.chat import NvidiaNimConfig from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig diff --git a/litellm/llms/OpenAI/chat/gpt_audio_transformation.py b/litellm/llms/OpenAI/chat/gpt_audio_transformation.py new file mode 100644 index 000000000..59f7dc01e --- /dev/null +++ b/litellm/llms/OpenAI/chat/gpt_audio_transformation.py @@ -0,0 +1,66 @@ +""" +Support for GPT-4o audio Family + +OpenAI Doc: https://platform.openai.com/docs/guides/audio/quickstart?audio-generation-quickstart-example=audio-in&lang=python +""" + +import types +from typing import Optional, Union + +import litellm +from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage + +from .gpt_transformation import OpenAIGPTConfig + + +class OpenAIGPTAudioConfig(OpenAIGPTConfig): + """ + Reference: https://platform.openai.com/docs/guides/audio + """ + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def 
get_supported_openai_params(self, model: str) -> list: + """ + Get the supported OpenAI params for the `gpt-audio` models + + """ + + all_openai_params = super().get_supported_openai_params(model=model) + audio_specific_params = ["audio"] + return all_openai_params + audio_specific_params + + def is_model_gpt_audio_model(self, model: str) -> bool: + if model in litellm.open_ai_chat_completion_models and "audio" in model: + return True + return False + + def _map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + return super()._map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + drop_params=drop_params, + ) diff --git a/litellm/llms/OpenAI/chat/gpt_transformation.py b/litellm/llms/OpenAI/chat/gpt_transformation.py index 6331322bf..4eced5b1b 100644 --- a/litellm/llms/OpenAI/chat/gpt_transformation.py +++ b/litellm/llms/OpenAI/chat/gpt_transformation.py @@ -93,6 +93,7 @@ class OpenAIGPTConfig: "top_logprobs", "max_tokens", "max_completion_tokens", + "modalities", "n", "presence_penalty", "seed", @@ -131,6 +132,17 @@ class OpenAIGPTConfig: model: str, drop_params: bool, ) -> dict: + """ + If any supported_openai_params are in non_default_params, add them to optional_params, so they are use in API call + + Args: + non_default_params (dict): Non-default parameters to filter. + optional_params (dict): Optional parameters to update. + model (str): Model name for parameter support check. + + Returns: + dict: Updated optional_params with supported non-default parameters. + """ supported_openai_params = self.get_supported_openai_params(model) for param, value in non_default_params.items(): if param in supported_openai_params: diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index cb118adca..acf6b3003 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -303,10 +303,25 @@ class OpenAIConfig: } def get_supported_openai_params(self, model: str) -> list: - if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model): - return litellm.OpenAIO1Config().get_supported_openai_params(model=model) + """ + This function returns the list of supported openai parameters for a given OpenAI Model + + - If O1 model, returns O1 supported params + - If gpt-audio model, returns gpt-audio supported params + - Else, returns gpt supported params + + Args: + model (str): OpenAI model + + Returns: + list: List of supported openai parameters + """ + if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model): + return litellm.openAIO1Config.get_supported_openai_params(model=model) + elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model): + return litellm.openAIGPTAudioConfig.get_supported_openai_params(model=model) else: - return litellm.OpenAIGPTConfig().get_supported_openai_params(model=model) + return litellm.openAIGPTConfig.get_supported_openai_params(model=model) def _map_openai_params( self, non_default_params: dict, optional_params: dict, model: str @@ -325,14 +340,22 @@ class OpenAIConfig: drop_params: bool, ) -> dict: """ """ - if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model): - return litellm.OpenAIO1Config().map_openai_params( + if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model): + return litellm.openAIO1Config.map_openai_params( non_default_params=non_default_params, optional_params=optional_params, model=model, drop_params=drop_params, ) - return 
litellm.OpenAIGPTConfig().map_openai_params( + elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model): + return litellm.openAIGPTAudioConfig.map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + drop_params=drop_params, + ) + + return litellm.openAIGPTConfig.map_openai_params( non_default_params=non_default_params, optional_params=optional_params, model=model, @@ -666,10 +689,10 @@ class OpenAIChatCompletion(BaseLLM): custom_llm_provider=custom_llm_provider, ) if ( - litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model) + litellm.openAIO1Config.is_model_o1_reasoning_model(model=model) and messages is not None ): - messages = litellm.OpenAIO1Config().o1_prompt_factory( + messages = litellm.openAIO1Config.o1_prompt_factory( messages=messages, ) diff --git a/litellm/main.py b/litellm/main.py index f93db2a8f..9b6d657a3 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -147,6 +147,8 @@ from .llms.vertex_ai_and_google_ai_studio.vertex_embeddings.embedding_handler im from .llms.watsonx import IBMWatsonXAI from .types.llms.openai import ( ChatCompletionAssistantMessage, + ChatCompletionAudioParam, + ChatCompletionModality, ChatCompletionUserMessage, HttpxBinaryResponseContent, ) @@ -287,6 +289,8 @@ async def acompletion( stop=None, max_tokens: Optional[int] = None, max_completion_tokens: Optional[int] = None, + modalities: Optional[List[ChatCompletionModality]] = None, + audio: Optional[ChatCompletionAudioParam] = None, presence_penalty: Optional[float] = None, frequency_penalty: Optional[float] = None, logit_bias: Optional[dict] = None, @@ -327,6 +331,8 @@ async def acompletion( stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. + modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request. You can use `["text", "audio"]` + audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"] presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. @@ -366,6 +372,8 @@ async def acompletion( "stop": stop, "max_tokens": max_tokens, "max_completion_tokens": max_completion_tokens, + "modalities": modalities, + "audio": audio, "presence_penalty": presence_penalty, "frequency_penalty": frequency_penalty, "logit_bias": logit_bias, @@ -670,6 +678,8 @@ def completion( # type: ignore # noqa: PLR0915 stop=None, max_completion_tokens: Optional[int] = None, max_tokens: Optional[int] = None, + modalities: Optional[List[ChatCompletionModality]] = None, + audio: Optional[ChatCompletionAudioParam] = None, presence_penalty: Optional[float] = None, frequency_penalty: Optional[float] = None, logit_bias: Optional[dict] = None, @@ -712,6 +722,8 @@ def completion( # type: ignore # noqa: PLR0915 stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. 
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. + modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request.. You can use `["text", "audio"]` + audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"] presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. @@ -816,6 +828,8 @@ def completion( # type: ignore # noqa: PLR0915 "stream_options", "stop", "max_completion_tokens", + "modalities", + "audio", "max_tokens", "presence_penalty", "frequency_penalty", @@ -975,6 +989,8 @@ def completion( # type: ignore # noqa: PLR0915 stop=stop, max_tokens=max_tokens, max_completion_tokens=max_completion_tokens, + modalities=modalities, + audio=audio, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, @@ -1515,7 +1531,7 @@ def completion( # type: ignore # noqa: PLR0915 ## COMPLETION CALL try: - if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model): + if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model): response = openai_o1_chat_completions.completion( model=model, messages=messages, diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index 17eb89fd9..0ddf97556 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -19,6 +19,8 @@ from openai.types.beta.threads.message import Message as OpenAIMessage from openai.types.beta.threads.message_content import MessageContent from openai.types.beta.threads.run import Run from openai.types.chat import ChatCompletionChunk +from openai.types.chat.chat_completion_audio_param import ChatCompletionAudioParam +from openai.types.chat.chat_completion_modality import ChatCompletionModality from openai.types.embedding import Embedding as OpenAIEmbedding from pydantic import BaseModel, Field from typing_extensions import Dict, Required, TypedDict, override diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 409c28458..fce45c336 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -321,6 +321,54 @@ class ChatCompletionMessageToolCall(OpenAIObject): setattr(self, key, value) +class ChatCompletionAudioResponse(OpenAIObject): + def __init__( + self, + data: str, + expires_at: int, + transcript: str, + id: Optional[str] = None, + **params, + ): + super(ChatCompletionAudioResponse, self).__init__(**params) + if id is not None: + self.id = id + else: + self.id = f"{uuid.uuid4()}" + """Unique identifier for this audio response.""" + + self.data = data + """ + Base64 encoded audio bytes generated by the model, in the format specified in + the request. + """ + + self.expires_at = expires_at + """ + The Unix timestamp (in seconds) for when this audio response will no longer be + accessible on the server for use in multi-turn conversations. 
+ """ + + self.transcript = transcript + """Transcript of the audio generated by the model.""" + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + + """ Reference: ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None)) @@ -328,11 +376,11 @@ ChatCompletionMessage(content='This is a test', role='assistant', function_call= class Message(OpenAIObject): - content: Optional[str] role: Literal["assistant", "user", "system", "tool", "function"] tool_calls: Optional[List[ChatCompletionMessageToolCall]] function_call: Optional[FunctionCall] + audio: Optional[ChatCompletionAudioResponse] = None def __init__( self, @@ -340,9 +388,10 @@ class Message(OpenAIObject): role: Literal["assistant"] = "assistant", function_call=None, tool_calls: Optional[list] = None, + audio: Optional[ChatCompletionAudioResponse] = None, **params, ): - init_values = { + init_values: Dict[str, Any] = { "content": content, "role": role or "assistant", # handle null input "function_call": ( @@ -361,11 +410,20 @@ class Message(OpenAIObject): else None ), } + + if audio is not None: + init_values["audio"] = audio + super(Message, self).__init__( **init_values, # type: ignore **params, ) + if audio is None: + # delete audio from self + # OpenAI compatible APIs like mistral API will raise an error if audio is passed in + del self.audio + def get(self, key, default=None): # Custom .get() method to access attributes with a default value if the attribute doesn't exist return getattr(self, key, default) diff --git a/litellm/utils.py b/litellm/utils.py index 49f7fe642..de7d528de 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2483,6 +2483,8 @@ def get_optional_params( # noqa: PLR0915 stop=None, max_tokens=None, max_completion_tokens=None, + modalities=None, + audio=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, @@ -2562,6 +2564,8 @@ def get_optional_params( # noqa: PLR0915 "stop": None, "max_tokens": None, "max_completion_tokens": None, + "modalities": None, + "audio": None, "presence_penalty": None, "frequency_penalty": None, "logit_bias": None, @@ -5734,6 +5738,7 @@ def convert_to_model_response_object( # noqa: PLR0915 role=choice["message"]["role"] or "assistant", function_call=choice["message"].get("function_call", None), tool_calls=tool_calls, + audio=choice["message"].get("audio", None), ) finish_reason = choice.get("finish_reason", None) if finish_reason is None: diff --git a/poetry.lock b/poetry.lock index 42aec43bc..7846ef049 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1823,13 +1823,13 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] [[package]] name = "openai" -version = "1.51.0" +version = "1.52.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.51.0-py3-none-any.whl", hash = "sha256:d9affafb7e51e5a27dce78589d4964ce4d6f6d560307265933a94b2e3f3c5d2c"}, - {file = "openai-1.51.0.tar.gz", hash = "sha256:8dc4f9d75ccdd5466fc8c99a952186eddceb9fd6ba694044773f3736a847149d"}, + {file = 
"openai-1.52.0-py3-none-any.whl", hash = "sha256:0c249f20920183b0a2ca4f7dba7b0452df3ecd0fa7985eb1d91ad884bc3ced9c"}, + {file = "openai-1.52.0.tar.gz", hash = "sha256:95c65a5f77559641ab8f3e4c3a050804f7b51d278870e2ec1f7444080bfe565a"}, ] [package.dependencies] @@ -3519,4 +3519,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "cryptography", "fastapi", "fastapi- [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0, !=3.9.7" -content-hash = "94beed60a176d854a59b7cf9cace6f7de83ae6036cbcdb8ed10273df5b299afa" +content-hash = "491d361cabc637f8f896091b92855040da670bb7b311dcbfe75ad20eab97400c" diff --git a/pyproject.toml b/pyproject.toml index 0a9f1246a..df07579f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai" [tool.poetry.dependencies] python = ">=3.8.1,<4.0, !=3.9.7" -openai = ">=1.51.0" +openai = ">=1.52.0" python-dotenv = ">=0.2.0" tiktoken = ">=0.7.0" importlib-metadata = ">=6.8.0" diff --git a/requirements.txt b/requirements.txt index 1cedeeaf7..a08ca5852 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # LITELLM PROXY DEPENDENCIES # anyio==4.4.0 # openai + http req. -openai==1.51.0 # openai req. +openai==1.52.0 # openai req. fastapi==0.111.0 # server dep backoff==2.2.1 # server dep pyyaml==6.0.0 # server dep diff --git a/tests/llm_translation/dog.wav b/tests/llm_translation/dog.wav new file mode 100644 index 000000000..4baa24f3d Binary files /dev/null and b/tests/llm_translation/dog.wav differ diff --git a/tests/llm_translation/test_gpt4o_audio.py b/tests/llm_translation/test_gpt4o_audio.py new file mode 100644 index 000000000..16128c2fe --- /dev/null +++ b/tests/llm_translation/test_gpt4o_audio.py @@ -0,0 +1,76 @@ +import json +import os +import sys +from datetime import datetime +from unittest.mock import AsyncMock + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path + + +import httpx +import pytest +from respx import MockRouter + +import litellm +from litellm import Choices, Message, ModelResponse +import base64 +import requests + + +@pytest.mark.asyncio +@pytest.mark.flaky(retries=3, delay=1) +async def test_audio_output_from_model(): + litellm.set_verbose = True + completion = await litellm.acompletion( + model="gpt-4o-audio-preview", + modalities=["text", "audio"], + audio={"voice": "alloy", "format": "wav"}, + messages=[{"role": "user", "content": "response in 1 word - yes or no"}], + ) + + print("response= ", completion) + + print(completion.choices[0]) + + assert completion.choices[0].message.audio is not None + assert isinstance( + completion.choices[0].message.audio, + litellm.types.utils.ChatCompletionAudioResponse, + ) + assert len(completion.choices[0].message.audio.data) > 0 + + wav_bytes = base64.b64decode(completion.choices[0].message.audio.data) + with open("dog.wav", "wb") as f: + f.write(wav_bytes) + + +@pytest.mark.asyncio +async def test_audio_input_to_model(): + # Fetch the audio file and convert it to a base64 encoded string + url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav" + response = requests.get(url) + response.raise_for_status() + wav_data = response.content + encoded_string = base64.b64encode(wav_data).decode("utf-8") + + completion = await litellm.acompletion( + model="gpt-4o-audio-preview", + modalities=["text", "audio"], + audio={"voice": "alloy", "format": "wav"}, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this recording?"}, + { + "type": 
"input_audio", + "input_audio": {"data": encoded_string, "format": "wav"}, + }, + ], + }, + ], + ) + + print(completion.choices[0].message)