Add OpenAI gpt-4o-transcribe support (#9517)

* refactor: introduce new transformation config for gpt-4o-transcribe models

* refactor: expose new transformation configs for audio transcription

* ci: fix config yml

* feat(openai/transcriptions): support provider config transformation on openai audio transcriptions

allows gpt-4o and whisper audio transcription to work as expected

* refactor: migrate fireworks ai + deepgram to new transform request pattern

* feat(openai/): working support for gpt-4o-transcribe

* build(model_prices_and_context_window.json): add gpt-4o-transcribe to model cost map

* build(model_prices_and_context_window.json): specify what endpoints are supported for `/audio/transcriptions`

* fix(get_supported_openai_params.py): fix return

* refactor(deepgram/): migrate unit test to deepgram handler

* refactor: cleanup unused imports

* fix(get_supported_openai_params.py): fix linting error

* test: update test
This commit is contained in:
Krish Dholakia 2025-03-26 23:10:25 -07:00 committed by GitHub
parent 109add7946
commit c0845fec1f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 402 additions and 92 deletions

View file

@ -1,4 +1,3 @@
import io
import json
from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union
@ -8,6 +7,9 @@ import litellm
import litellm.litellm_core_utils
import litellm.types
import litellm.types.utils
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
@ -852,54 +854,12 @@ class BaseLLMHTTPHandler:
request_data=request_data,
)
def handle_audio_file(self, audio_file: FileTypes) -> bytes:
    """
    Processes the audio file input based on its type and returns the binary data.

    Args:
        audio_file: Can be a file path (str), a tuple (filename, file_content)
            where file_content is a path (str) or raw bytes, binary data (bytes),
            or a file-like object (io.BufferedReader / io.BytesIO).

    Returns:
        The binary data of the audio file.

    Raises:
        TypeError: If audio_file (or the tuple's content element) is not one of
            the supported types.
    """
    binary_data: bytes  # Explicitly declare the type

    # Handle the audio file based on type
    if isinstance(audio_file, str):
        # It's a file path: read the file from disk.
        with open(audio_file, "rb") as f:
            binary_data = f.read()  # `f.read()` always returns `bytes`
    elif isinstance(audio_file, tuple):
        # (filename, file_content[, ...]) tuple: only the content element matters.
        _, file_content = audio_file[:2]
        if isinstance(file_content, str):
            # Content is itself a path on disk.
            with open(file_content, "rb") as f:
                binary_data = f.read()  # `f.read()` always returns `bytes`
        elif isinstance(file_content, bytes):
            binary_data = file_content
        else:
            raise TypeError(
                f"Unexpected type in tuple: {type(file_content)}. Expected str or bytes."
            )
    elif isinstance(audio_file, bytes):
        # Assume it's already binary data
        binary_data = audio_file
    elif isinstance(audio_file, (io.BufferedReader, io.BytesIO)):
        # Handle file-like objects. Idiomatic single isinstance() with a tuple
        # of types replaces the original chained `or` of two isinstance calls.
        binary_data = audio_file.read()
    else:
        raise TypeError(f"Unsupported type for audio_file: {type(audio_file)}")

    return binary_data
def audio_transcriptions(
self,
model: str,
audio_file: FileTypes,
optional_params: dict,
litellm_params: dict,
model_response: TranscriptionResponse,
timeout: float,
max_retries: int,
@ -910,11 +870,8 @@ class BaseLLMHTTPHandler:
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
atranscription: bool = False,
headers: dict = {},
litellm_params: dict = {},
provider_config: Optional[BaseAudioTranscriptionConfig] = None,
) -> TranscriptionResponse:
provider_config = ProviderConfigManager.get_provider_audio_transcription_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
)
if provider_config is None:
raise ValueError(
f"No provider config found for model: {model} and provider: {custom_llm_provider}"
@ -938,7 +895,18 @@ class BaseLLMHTTPHandler:
)
# Handle the audio file based on type
binary_data = self.handle_audio_file(audio_file)
data = provider_config.transform_audio_transcription_request(
model=model,
audio_file=audio_file,
optional_params=optional_params,
litellm_params=litellm_params,
)
binary_data: Optional[bytes] = None
json_data: Optional[dict] = None
if isinstance(data, bytes):
binary_data = data
else:
json_data = data
try:
# Make the POST request
@ -946,6 +914,7 @@ class BaseLLMHTTPHandler:
url=complete_url,
headers=headers,
content=binary_data,
json=json_data,
timeout=timeout,
)
except Exception as e: