forked from phoenix/litellm-mirror
(feat) Support audio, modalities params (#6304)

* add audio, modalities param
* add test for gpt audio models
* add get_supported_openai_params for GPT audio models
* add supported params for audio
* test_audio_output_from_model
* bump openai to openai==1.52.0
* bump openai on pyproject
* fix audio test
* fix test mock_chat_response
* handle audio for Message
* fix handling audio for OAI compatible API endpoints
* fix linting
* fix mock dbrx test

parent e35fc3203e
commit 13e0b3f626
15 changed files with 290 additions and 23 deletions
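In practice, the new `modalities` and `audio` params are used as in the test added by this commit; a minimal sketch (the output path `answer.wav` is hypothetical):

import asyncio
import base64

import litellm

async def main():
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
    )
    # The model returns base64-encoded WAV bytes plus a transcript.
    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
    with open("answer.wav", "wb") as f:  # hypothetical output path
        f.write(wav_bytes)

asyncio.run(main())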
@@ -47,7 +47,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.51.0
+pip install openai==1.52.0
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -517,7 +517,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -634,7 +634,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
 - run:
     name: Build Docker image
     command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@@ -726,7 +726,7 @@ jobs:
 pip install "pytest-asyncio==0.21.1"
 pip install "google-cloud-aiplatform==1.43.0"
 pip install aiohttp
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -921,7 +921,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.51.0"
+pip install "openai==1.52.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.51.0
+openai==1.52.0
 python-dotenv
 tiktoken
 importlib_metadata
@@ -987,10 +987,19 @@ from .llms.mistral.mistral_chat_transformation import MistralConfig
 from .llms.OpenAI.chat.o1_transformation import (
     OpenAIO1Config,
 )
+
+openAIO1Config = OpenAIO1Config()
 from .llms.OpenAI.chat.gpt_transformation import (
     OpenAIGPTConfig,
 )
+
+openAIGPTConfig = OpenAIGPTConfig()
+from .llms.OpenAI.chat.gpt_audio_transformation import (
+    OpenAIGPTAudioConfig,
+)
+
+openAIGPTAudioConfig = OpenAIGPTAudioConfig()
+
 from .llms.nvidia_nim.chat import NvidiaNimConfig
 from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
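These module-level instances let call sites share one config object instead of constructing a new one per call; later hunks in this commit switch the call sites over. A small before/after sketch:

import litellm

# Before: a fresh config object on every call
litellm.OpenAIO1Config().is_model_o1_reasoning_model(model="o1-preview")

# After: reuse the instance created once at import time
litellm.openAIO1Config.is_model_o1_reasoning_model(model="o1-preview")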
litellm/llms/OpenAI/chat/gpt_audio_transformation.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+"""
+Support for GPT-4o audio Family
+
+OpenAI Doc: https://platform.openai.com/docs/guides/audio/quickstart?audio-generation-quickstart-example=audio-in&lang=python
+"""
+
+import types
+from typing import Optional, Union
+
+import litellm
+from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage
+
+from .gpt_transformation import OpenAIGPTConfig
+
+
+class OpenAIGPTAudioConfig(OpenAIGPTConfig):
+    """
+    Reference: https://platform.openai.com/docs/guides/audio
+    """
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+    def get_supported_openai_params(self, model: str) -> list:
+        """
+        Get the supported OpenAI params for the `gpt-audio` models
+        """
+        all_openai_params = super().get_supported_openai_params(model=model)
+        audio_specific_params = ["audio"]
+        return all_openai_params + audio_specific_params
+
+    def is_model_gpt_audio_model(self, model: str) -> bool:
+        if model in litellm.open_ai_chat_completion_models and "audio" in model:
+            return True
+        return False
+
+    def _map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        model: str,
+        drop_params: bool,
+    ) -> dict:
+        return super()._map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+            drop_params=drop_params,
+        )
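A quick sketch of what the new config reports, assuming `gpt-4o-audio-preview` is registered in `litellm.open_ai_chat_completion_models`:

import litellm

cfg = litellm.openAIGPTAudioConfig
# True when the model is a registered chat model whose name contains "audio"
print(cfg.is_model_gpt_audio_model("gpt-4o-audio-preview"))
# Base GPT params plus the audio-specific "audio" param
print("audio" in cfg.get_supported_openai_params(model="gpt-4o-audio-preview"))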
@@ -93,6 +93,7 @@ class OpenAIGPTConfig:
         "top_logprobs",
         "max_tokens",
         "max_completion_tokens",
+        "modalities",
         "n",
         "presence_penalty",
         "seed",
@@ -131,6 +132,17 @@ class OpenAIGPTConfig:
         model: str,
         drop_params: bool,
     ) -> dict:
+        """
+        If any supported_openai_params are in non_default_params, add them to optional_params, so they are used in the API call.
+
+        Args:
+            non_default_params (dict): Non-default parameters to filter.
+            optional_params (dict): Optional parameters to update.
+            model (str): Model name for parameter support check.
+
+        Returns:
+            dict: Updated optional_params with supported non-default parameters.
+        """
         supported_openai_params = self.get_supported_openai_params(model)
         for param, value in non_default_params.items():
             if param in supported_openai_params:
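A hedged sketch of the filtering the new docstring describes (param values are made up; keys outside the supported list are simply left out of `optional_params`):

import litellm

cfg = litellm.openAIGPTConfig
optional_params = cfg._map_openai_params(
    non_default_params={"modalities": ["text", "audio"], "not_a_real_param": 1},
    optional_params={},
    model="gpt-4o",
    drop_params=False,
)
print(optional_params)  # expected: {"modalities": ["text", "audio"]}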
@@ -303,10 +303,25 @@ class OpenAIConfig:
         }

     def get_supported_openai_params(self, model: str) -> list:
-        if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
-            return litellm.OpenAIO1Config().get_supported_openai_params(model=model)
+        """
+        This function returns the list of supported openai parameters for a given OpenAI Model
+
+        - If O1 model, returns O1 supported params
+        - If gpt-audio model, returns gpt-audio supported params
+        - Else, returns gpt supported params
+
+        Args:
+            model (str): OpenAI model
+
+        Returns:
+            list: List of supported openai parameters
+        """
+        if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
+            return litellm.openAIO1Config.get_supported_openai_params(model=model)
+        elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model):
+            return litellm.openAIGPTAudioConfig.get_supported_openai_params(model=model)
         else:
-            return litellm.OpenAIGPTConfig().get_supported_openai_params(model=model)
+            return litellm.openAIGPTConfig.get_supported_openai_params(model=model)

     def _map_openai_params(
         self, non_default_params: dict, optional_params: dict, model: str
@@ -325,14 +340,22 @@ class OpenAIConfig:
         drop_params: bool,
     ) -> dict:
         """ """
-        if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
-            return litellm.OpenAIO1Config().map_openai_params(
+        if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
+            return litellm.openAIO1Config.map_openai_params(
                 non_default_params=non_default_params,
                 optional_params=optional_params,
                 model=model,
                 drop_params=drop_params,
             )
-        return litellm.OpenAIGPTConfig().map_openai_params(
+        elif litellm.openAIGPTAudioConfig.is_model_gpt_audio_model(model=model):
+            return litellm.openAIGPTAudioConfig.map_openai_params(
+                non_default_params=non_default_params,
+                optional_params=optional_params,
+                model=model,
+                drop_params=drop_params,
+            )
+
+        return litellm.openAIGPTConfig.map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,
             model=model,
@@ -666,10 +689,10 @@ class OpenAIChatCompletion(BaseLLM):
             custom_llm_provider=custom_llm_provider,
         )
         if (
-            litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model)
+            litellm.openAIO1Config.is_model_o1_reasoning_model(model=model)
             and messages is not None
         ):
-            messages = litellm.OpenAIO1Config().o1_prompt_factory(
+            messages = litellm.openAIO1Config.o1_prompt_factory(
                 messages=messages,
            )
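With the audio branch in place, `OpenAIConfig` resolves supported params in order o1, then gpt-audio, then the GPT default; a sketch:

import litellm

cfg = litellm.OpenAIConfig()
print(cfg.get_supported_openai_params("o1-preview"))            # o1-specific list
print(cfg.get_supported_openai_params("gpt-4o-audio-preview"))  # GPT list + "audio"
print(cfg.get_supported_openai_params("gpt-4o"))                # GPT default list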
@@ -147,6 +147,8 @@ from .llms.vertex_ai_and_google_ai_studio.vertex_embeddings.embedding_handler im
 from .llms.watsonx import IBMWatsonXAI
 from .types.llms.openai import (
     ChatCompletionAssistantMessage,
+    ChatCompletionAudioParam,
+    ChatCompletionModality,
     ChatCompletionUserMessage,
     HttpxBinaryResponseContent,
 )
@@ -287,6 +289,8 @@ async def acompletion(
     stop=None,
     max_tokens: Optional[int] = None,
     max_completion_tokens: Optional[int] = None,
+    modalities: Optional[List[ChatCompletionModality]] = None,
+    audio: Optional[ChatCompletionAudioParam] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     logit_bias: Optional[dict] = None,
@@ -327,6 +331,8 @@ async def acompletion(
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
         max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
+        modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request. You can use `["text", "audio"]`
+        audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -366,6 +372,8 @@ async def acompletion(
         "stop": stop,
         "max_tokens": max_tokens,
         "max_completion_tokens": max_completion_tokens,
+        "modalities": modalities,
+        "audio": audio,
         "presence_penalty": presence_penalty,
         "frequency_penalty": frequency_penalty,
         "logit_bias": logit_bias,
@@ -670,6 +678,8 @@ def completion(  # type: ignore  # noqa: PLR0915
     stop=None,
     max_completion_tokens: Optional[int] = None,
     max_tokens: Optional[int] = None,
+    modalities: Optional[List[ChatCompletionModality]] = None,
+    audio: Optional[ChatCompletionAudioParam] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     logit_bias: Optional[dict] = None,
@@ -712,6 +722,8 @@ def completion(  # type: ignore  # noqa: PLR0915
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
         max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
+        modalities (List[ChatCompletionModality], optional): Output types that you would like the model to generate for this request. You can use `["text", "audio"]`
+        audio (ChatCompletionAudioParam, optional): Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -816,6 +828,8 @@ def completion(  # type: ignore  # noqa: PLR0915
         "stream_options",
         "stop",
         "max_completion_tokens",
+        "modalities",
+        "audio",
         "max_tokens",
         "presence_penalty",
         "frequency_penalty",
@@ -975,6 +989,8 @@ def completion(  # type: ignore  # noqa: PLR0915
         stop=stop,
         max_tokens=max_tokens,
         max_completion_tokens=max_completion_tokens,
+        modalities=modalities,
+        audio=audio,
         presence_penalty=presence_penalty,
         frequency_penalty=frequency_penalty,
         logit_bias=logit_bias,
@@ -1515,7 +1531,7 @@ def completion(  # type: ignore  # noqa: PLR0915

         ## COMPLETION CALL
         try:
-            if litellm.OpenAIO1Config().is_model_o1_reasoning_model(model=model):
+            if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
                 response = openai_o1_chat_completions.completion(
                     model=model,
                     messages=messages,
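The synchronous `completion()` accepts the same params; for audio input, the content blocks carry base64 data, as in the new test at the bottom of this diff. A sketch with a hypothetical local file:

import base64

import litellm

with open("question.wav", "rb") as f:  # hypothetical input file
    encoded = base64.b64encode(f.read()).decode("utf-8")

resp = litellm.completion(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this recording?"},
                {
                    "type": "input_audio",
                    "input_audio": {"data": encoded, "format": "wav"},
                },
            ],
        }
    ],
)
print(resp.choices[0].message)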
@@ -19,6 +19,8 @@ from openai.types.beta.threads.message import Message as OpenAIMessage
 from openai.types.beta.threads.message_content import MessageContent
 from openai.types.beta.threads.run import Run
 from openai.types.chat import ChatCompletionChunk
+from openai.types.chat.chat_completion_audio_param import ChatCompletionAudioParam
+from openai.types.chat.chat_completion_modality import ChatCompletionModality
 from openai.types.embedding import Embedding as OpenAIEmbedding
 from pydantic import BaseModel, Field
 from typing_extensions import Dict, Required, TypedDict, override
@@ -321,6 +321,54 @@ class ChatCompletionMessageToolCall(OpenAIObject):
         setattr(self, key, value)


+class ChatCompletionAudioResponse(OpenAIObject):
+    def __init__(
+        self,
+        data: str,
+        expires_at: int,
+        transcript: str,
+        id: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionAudioResponse, self).__init__(**params)
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+        """Unique identifier for this audio response."""
+
+        self.data = data
+        """
+        Base64 encoded audio bytes generated by the model, in the format specified in
+        the request.
+        """
+
+        self.expires_at = expires_at
+        """
+        The Unix timestamp (in seconds) for when this audio response will no longer be
+        accessible on the server for use in multi-turn conversations.
+        """
+
+        self.transcript = transcript
+        """Transcript of the audio generated by the model."""
+
+    def __contains__(self, key):
+        # Define custom behavior for the 'in' operator
+        return hasattr(self, key)
+
+    def get(self, key, default=None):
+        # Custom .get() method to access attributes with a default value if the attribute doesn't exist
+        return getattr(self, key, default)
+
+    def __getitem__(self, key):
+        # Allow dictionary-style access to attributes
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        # Allow dictionary-style assignment of attributes
+        setattr(self, key, value)
+
+
 """
 Reference:
 ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None))
@@ -328,11 +376,11 @@ ChatCompletionMessage(content='This is a test', role='assistant', function_call=


 class Message(OpenAIObject):

     content: Optional[str]
     role: Literal["assistant", "user", "system", "tool", "function"]
     tool_calls: Optional[List[ChatCompletionMessageToolCall]]
     function_call: Optional[FunctionCall]
+    audio: Optional[ChatCompletionAudioResponse] = None

     def __init__(
         self,
@@ -340,9 +388,10 @@ class Message(OpenAIObject):
         role: Literal["assistant"] = "assistant",
         function_call=None,
         tool_calls: Optional[list] = None,
+        audio: Optional[ChatCompletionAudioResponse] = None,
         **params,
     ):
-        init_values = {
+        init_values: Dict[str, Any] = {
             "content": content,
             "role": role or "assistant",  # handle null input
             "function_call": (
@@ -361,11 +410,20 @@ class Message(OpenAIObject):
                 else None
             ),
         }
+
+        if audio is not None:
+            init_values["audio"] = audio
+
         super(Message, self).__init__(
             **init_values,  # type: ignore
             **params,
         )
+
+        if audio is None:
+            # delete audio from self
+            # OpenAI compatible APIs like mistral API will raise an error if audio is passed in
+            del self.audio

     def get(self, key, default=None):
         # Custom .get() method to access attributes with a default value if the attribute doesn't exist
         return getattr(self, key, default)
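The response type mirrors OpenAI's: `data` is base64 audio, `expires_at` a Unix timestamp, `transcript` plain text, and the helper methods allow dict-style access. A construction sketch with placeholder values:

import base64
from litellm.types.utils import ChatCompletionAudioResponse, Message

audio = ChatCompletionAudioResponse(
    data=base64.b64encode(b"RIFF...").decode(),  # placeholder, not a real WAV payload
    expires_at=1729000000,  # placeholder timestamp
    transcript="yes",
)
print(audio.transcript, audio["data"] == audio.get("data"))

# When audio is None, Message deletes the attribute so OpenAI-compatible
# APIs (e.g. Mistral) don't reject the unexpected field.
msg = Message(content="yes", audio=audio)
print(msg.audio.transcript)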
@@ -2483,6 +2483,8 @@ def get_optional_params(  # noqa: PLR0915
     stop=None,
     max_tokens=None,
     max_completion_tokens=None,
+    modalities=None,
+    audio=None,
     presence_penalty=None,
     frequency_penalty=None,
     logit_bias=None,
@@ -2562,6 +2564,8 @@ def get_optional_params(  # noqa: PLR0915
     "stop": None,
     "max_tokens": None,
     "max_completion_tokens": None,
+    "modalities": None,
+    "audio": None,
     "presence_penalty": None,
     "frequency_penalty": None,
     "logit_bias": None,
@@ -5734,6 +5738,7 @@ def convert_to_model_response_object(  # noqa: PLR0915
     role=choice["message"]["role"] or "assistant",
     function_call=choice["message"].get("function_call", None),
     tool_calls=tool_calls,
+    audio=choice["message"].get("audio", None),
 )
 finish_reason = choice.get("finish_reason", None)
 if finish_reason is None:
poetry.lock (generated, 8 changes)
@@ -1823,13 +1823,13 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]

 [[package]]
 name = "openai"
-version = "1.51.0"
+version = "1.52.0"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.51.0-py3-none-any.whl", hash = "sha256:d9affafb7e51e5a27dce78589d4964ce4d6f6d560307265933a94b2e3f3c5d2c"},
-    {file = "openai-1.51.0.tar.gz", hash = "sha256:8dc4f9d75ccdd5466fc8c99a952186eddceb9fd6ba694044773f3736a847149d"},
+    {file = "openai-1.52.0-py3-none-any.whl", hash = "sha256:0c249f20920183b0a2ca4f7dba7b0452df3ecd0fa7985eb1d91ad884bc3ced9c"},
+    {file = "openai-1.52.0.tar.gz", hash = "sha256:95c65a5f77559641ab8f3e4c3a050804f7b51d278870e2ec1f7444080bfe565a"},
 ]

 [package.dependencies]
@@ -3519,4 +3519,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "cryptography", "fastapi", "fastapi-
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0, !=3.9.7"
-content-hash = "94beed60a176d854a59b7cf9cace6f7de83ae6036cbcdb8ed10273df5b299afa"
+content-hash = "491d361cabc637f8f896091b92855040da670bb7b311dcbfe75ad20eab97400c"
@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0, !=3.9.7"
-openai = ">=1.51.0"
+openai = ">=1.52.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 importlib-metadata = ">=6.8.0"
@@ -1,6 +1,6 @@
 # LITELLM PROXY DEPENDENCIES #
 anyio==4.4.0 # openai + http req.
-openai==1.51.0 # openai req.
+openai==1.52.0 # openai req.
 fastapi==0.111.0 # server dep
 backoff==2.2.1 # server dep
 pyyaml==6.0.0 # server dep
tests/llm_translation/dog.wav (new binary file, not shown)
tests/llm_translation/test_gpt4o_audio.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+import json
+import os
+import sys
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+import base64
+import requests
+
+
+@pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
+async def test_audio_output_from_model():
+    litellm.set_verbose = True
+    completion = await litellm.acompletion(
+        model="gpt-4o-audio-preview",
+        modalities=["text", "audio"],
+        audio={"voice": "alloy", "format": "wav"},
+        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+    )
+
+    print("response= ", completion)
+
+    print(completion.choices[0])
+
+    assert completion.choices[0].message.audio is not None
+    assert isinstance(
+        completion.choices[0].message.audio,
+        litellm.types.utils.ChatCompletionAudioResponse,
+    )
+    assert len(completion.choices[0].message.audio.data) > 0
+
+    wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+    with open("dog.wav", "wb") as f:
+        f.write(wav_bytes)
+
+
+@pytest.mark.asyncio
+async def test_audio_input_to_model():
+    # Fetch the audio file and convert it to a base64 encoded string
+    url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+    response = requests.get(url)
+    response.raise_for_status()
+    wav_data = response.content
+    encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+    completion = await litellm.acompletion(
+        model="gpt-4o-audio-preview",
+        modalities=["text", "audio"],
+        audio={"voice": "alloy", "format": "wav"},
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is in this recording?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {"data": encoded_string, "format": "wav"},
+                    },
+                ],
+            },
+        ],
+    )
+
+    print(completion.choices[0].message)