Add OpenAI gpt-4o-transcribe support (#9517)

* refactor: introduce new transformation config for gpt-4o-transcribe models

* refactor: expose new transformation configs for audio transcription

* ci: fix config yml

* feat(openai/transcriptions): support provider config transformation on openai audio transcriptions

allows gpt-4o and whisper audio transcription to work as expected (see the provider-config sketch after this list)

* refactor: migrate fireworks ai + deepgram to new transform request pattern

* feat(openai/): working support for gpt-4o-transcribe

* build(model_prices_and_context_window.json): add gpt-4o-transcribe to model cost map

* build(model_prices_and_context_window.json): specify what endpoints are supported for `/audio/transcriptions`

* fix(get_supported_openai_params.py): fix return

* refactor(deepgram/): migrate unit test to deepgram handler

* refactor: cleanup unused imports

* fix(get_supported_openai_params.py): fix linting error

* test: update test
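
For context, a minimal sketch of the provider-config pattern these commits introduce. Only the import paths and the method signature are taken from the diff below; the class name, docstring, and return value are illustrative, and the real base class may declare additional required methods.

```python
from typing import Union

from litellm.llms.base_llm.audio_transcription.transformation import (
    BaseAudioTranscriptionConfig,
)
from litellm.types.utils import FileTypes


class MyProviderAudioTranscriptionConfig(BaseAudioTranscriptionConfig):
    """Illustrative subclass; hypothetical provider shown for shape only."""

    def transform_audio_transcription_request(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
    ) -> Union[dict, bytes]:
        # Return multipart form fields as a dict for OpenAI-style routes;
        # per the handler below, the OpenAI route rejects raw bytes, which
        # other providers (e.g. Deepgram) may return instead.
        return {"model": model, "file": audio_file, **optional_params}
```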
Krish Dholakia 2025-03-26 23:10:25 -07:00 committed by GitHub
parent f2df53771c
commit d58fe5a9f9
20 changed files with 402 additions and 92 deletions


@@ -7,6 +7,9 @@ from pydantic import BaseModel
 import litellm
 from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_name
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
+from litellm.llms.base_llm.audio_transcription.transformation import (
+    BaseAudioTranscriptionConfig,
+)
 from litellm.types.utils import FileTypes
 from litellm.utils import (
     TranscriptionResponse,
@@ -75,6 +78,7 @@ class OpenAIAudioTranscription(OpenAIChatCompletion):
         model: str,
         audio_file: FileTypes,
         optional_params: dict,
+        litellm_params: dict,
         model_response: TranscriptionResponse,
         timeout: float,
         max_retries: int,
@@ -83,16 +87,24 @@ class OpenAIAudioTranscription(OpenAIChatCompletion):
         api_base: Optional[str],
         client=None,
         atranscription: bool = False,
+        provider_config: Optional[BaseAudioTranscriptionConfig] = None,
     ) -> TranscriptionResponse:
-        data = {"model": model, "file": audio_file, **optional_params}
-        if "response_format" not in data or (
-            data["response_format"] == "text" or data["response_format"] == "json"
-        ):
-            data["response_format"] = (
-                "verbose_json"  # ensures 'duration' is received - used for cost calculation
-            )
+        """
+        Handle audio transcription request
+        """
+        if provider_config is not None:
+            data = provider_config.transform_audio_transcription_request(
+                model=model,
+                audio_file=audio_file,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+            )
+            if isinstance(data, bytes):
+                raise ValueError("OpenAI transformation route requires a dict")
+        else:
+            data = {"model": model, "file": audio_file, **optional_params}
         if atranscription is True:
             return self.async_audio_transcriptions(  # type: ignore
                 audio_file=audio_file,
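
For reference, a hedged end-to-end sketch of calling the newly supported model through litellm's public transcription API. The file name and environment setup are assumptions.

```python
import litellm

# Assumes OPENAI_API_KEY is set in the environment and sample.wav exists.
with open("sample.wav", "rb") as audio_file:
    response = litellm.transcription(
        model="gpt-4o-transcribe",
        file=audio_file,
    )

# TranscriptionResponse exposes the transcribed text.
print(response.text)
```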