Add OpenAI gpt-4o-transcribe support (#9517)

* refactor: introduce new transformation config for gpt-4o-transcribe models

* refactor: expose new transformation configs for audio transcription

* ci: fix config yml

* feat(openai/transcriptions): support provider config transformation on openai audio transcriptions

allows gpt-4o and whisper audio transcription to work as expected

* refactor: migrate fireworks ai + deepgram to new transform request pattern

* feat(openai/): working support for gpt-4o-transcribe

* build(model_prices_and_context_window.json): add gpt-4o-transcribe to model cost map

* build(model_prices_and_context_window.json): specify what endpoints are supported for `/audio/transcriptions`

* fix(get_supported_openai_params.py): fix return

* refactor(deepgram/): migrate unit test to deepgram handler

* refactor: cleanup unused imports

* fix(get_supported_openai_params.py): fix linting error

* test: update test
This commit is contained in:
Krish Dholakia 2025-03-26 23:10:25 -07:00 committed by GitHub
parent 109add7946
commit c0845fec1f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 402 additions and 92 deletions

View file

@ -1,4 +1,3 @@
import io
import json
from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union
@ -8,6 +7,9 @@ import litellm
import litellm.litellm_core_utils
import litellm.types
import litellm.types.utils
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
@ -852,54 +854,12 @@ class BaseLLMHTTPHandler:
request_data=request_data,
)
def handle_audio_file(self, audio_file: FileTypes) -> bytes:
    """
    Processes the audio file input based on its type and returns the binary data.

    Args:
        audio_file: Can be a file path (str), a tuple (filename, file_content)
            where file_content is a path (str) or raw bytes, binary data (bytes),
            or a file-like object (io.BufferedReader / io.BytesIO).

    Returns:
        The binary data of the audio file.

    Raises:
        TypeError: If audio_file (or the tuple's content element) is not one of
            the supported types.
    """
    binary_data: bytes  # Explicitly declare the type

    # Handle the audio file based on type
    if isinstance(audio_file, str):
        # It's a file path: read the file from disk.
        with open(audio_file, "rb") as f:
            binary_data = f.read()  # `f.read()` always returns `bytes`
    elif isinstance(audio_file, tuple):
        # (filename, file_content[, ...]) tuple: only the content element matters.
        _, file_content = audio_file[:2]
        if isinstance(file_content, str):
            # Content is itself a path on disk.
            with open(file_content, "rb") as f:
                binary_data = f.read()  # `f.read()` always returns `bytes`
        elif isinstance(file_content, bytes):
            binary_data = file_content
        else:
            raise TypeError(
                f"Unexpected type in tuple: {type(file_content)}. Expected str or bytes."
            )
    elif isinstance(audio_file, bytes):
        # Assume it's already binary data
        binary_data = audio_file
    elif isinstance(audio_file, (io.BufferedReader, io.BytesIO)):
        # Handle file-like objects. Idiomatic single isinstance() with a tuple
        # of types replaces the original chained `or` of two isinstance calls.
        binary_data = audio_file.read()
    else:
        raise TypeError(f"Unsupported type for audio_file: {type(audio_file)}")

    return binary_data
def audio_transcriptions(
self,
model: str,
audio_file: FileTypes,
optional_params: dict,
litellm_params: dict,
model_response: TranscriptionResponse,
timeout: float,
max_retries: int,
@ -910,11 +870,8 @@ class BaseLLMHTTPHandler:
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
atranscription: bool = False,
headers: dict = {},
litellm_params: dict = {},
provider_config: Optional[BaseAudioTranscriptionConfig] = None,
) -> TranscriptionResponse:
provider_config = ProviderConfigManager.get_provider_audio_transcription_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
)
if provider_config is None:
raise ValueError(
f"No provider config found for model: {model} and provider: {custom_llm_provider}"
@ -938,7 +895,18 @@ class BaseLLMHTTPHandler:
)
# Handle the audio file based on type
binary_data = self.handle_audio_file(audio_file)
data = provider_config.transform_audio_transcription_request(
model=model,
audio_file=audio_file,
optional_params=optional_params,
litellm_params=litellm_params,
)
binary_data: Optional[bytes] = None
json_data: Optional[dict] = None
if isinstance(data, bytes):
binary_data = data
else:
json_data = data
try:
# Make the POST request
@ -946,6 +914,7 @@ class BaseLLMHTTPHandler:
url=complete_url,
headers=headers,
content=binary_data,
json=json_data,
timeout=timeout,
)
except Exception as e: