Litellm dev 12 28 2024 p3 (#7464)

* feat(deepgram/): initial end-to-end support for Deepgram STT

Uses deepgram's `/listen` endpoint to transcribe speech to text

 Closes https://github.com/BerriAI/litellm/issues/4875

* fix: fix linting errors

* test: fix test
This commit is contained in:
Krish Dholakia 2024-12-28 19:18:58 -08:00 committed by GitHub
parent 480d838790
commit ebc28b1921
10 changed files with 303 additions and 5 deletions

View file

@@ -1,3 +1,4 @@
import io
import json
from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
@@ -17,7 +18,7 @@ from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
)
from litellm.types.rerank import OptionalRerankParams, RerankResponse
from litellm.types.utils import EmbeddingResponse
from litellm.types.utils import EmbeddingResponse, FileTypes, TranscriptionResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
if TYPE_CHECKING:
@@ -667,6 +668,115 @@ class BaseLLMHTTPHandler:
request_data=request_data,
)
def handle_audio_file(self, audio_file: "FileTypes") -> bytes:
    """
    Process the audio file input based on its type and return its binary data.

    Args:
        audio_file: One of:
            - a file path (str),
            - a (filename, file_content) tuple where file_content is a
              path string or raw bytes (extra tuple elements are ignored),
            - raw binary data (bytes),
            - a file-like object exposing ``read()`` (e.g. an open binary
              file handle or ``io.BytesIO``).

    Returns:
        The binary content of the audio file.

    Raises:
        TypeError: If ``audio_file`` (or the content inside a tuple) is of
            an unsupported type, or the tuple is too short to unpack.
    """
    # The accepted types are disjoint, so check the cheap concrete types
    # first and leave the duck-typed file-like fallback for last.
    if isinstance(audio_file, bytes):
        # Already raw binary data - nothing to do.
        return audio_file

    if isinstance(audio_file, str):
        # Treat the string as a filesystem path.
        with open(audio_file, "rb") as f:
            return f.read()  # `f.read()` always returns `bytes`

    if isinstance(audio_file, tuple):
        # (filename, file_content[, content_type]) tuple; only the content
        # matters here. Guard the unpack so a malformed 1-tuple raises a
        # clear TypeError instead of an opaque ValueError.
        if len(audio_file) < 2:
            raise TypeError(
                f"Expected a (filename, file_content) tuple, got: {audio_file!r}"
            )
        _, file_content = audio_file[:2]
        if isinstance(file_content, str):
            # Content given as a path - read it from disk.
            with open(file_content, "rb") as f:
                return f.read()
        if isinstance(file_content, bytes):
            return file_content
        raise TypeError(
            f"Unexpected type in tuple: {type(file_content)}. Expected str or bytes."
        )

    if hasattr(audio_file, "read"):
        # Generalized from `isinstance(audio_file, io.BufferedReader)`:
        # accept ANY file-like object (io.BytesIO, SpooledTemporaryFile,
        # an open wave file, ...), all of which expose read().
        return audio_file.read()

    raise TypeError(f"Unsupported type for audio_file: {type(audio_file)}")
def audio_transcriptions(
    self,
    model: str,
    audio_file: FileTypes,
    optional_params: dict,
    model_response: TranscriptionResponse,
    timeout: float,
    max_retries: int,
    logging_obj: LiteLLMLoggingObj,
    api_key: Optional[str],
    api_base: Optional[str],
    custom_llm_provider: str,
    client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    atranscription: bool = False,
    headers: Optional[dict] = None,
) -> TranscriptionResponse:
    """
    Transcribe an audio file by POSTing its raw bytes to the provider's
    speech-to-text endpoint (e.g. Deepgram's `/listen`).

    Args:
        model: Model name used to resolve the provider transcription config.
        audio_file: Audio input; see `handle_audio_file` for accepted forms.
        optional_params: Provider-specific request parameters.
        model_response: Response object to populate and return.
        timeout: Request timeout in seconds, forwarded to the HTTP client.
        max_retries: Currently unused in this code path.
        logging_obj: LiteLLM logging object, forwarded to the response transform.
        api_key: Provider API key; may be None if resolved from the environment.
        api_base: Override for the provider base URL.
        custom_llm_provider: Provider slug (e.g. "deepgram").
        client: Optional pre-built HTTP client; only a sync `HTTPHandler`
            is reused, anything else is replaced with a fresh sync client.
        atranscription: Async flag; currently unused in this sync code path.
        headers: Extra request headers. Defaults to None (fix for the
            original mutable `{}` default, which is shared across calls).

    Returns:
        The transformed `TranscriptionResponse` for providers with a known
        transform (Deepgram); otherwise the `model_response` passed in.

    Raises:
        ValueError: If no provider transcription config exists for the model.
    """
    # Normalize the header dict locally instead of using a mutable default
    # argument (a shared default dict could leak mutations across calls).
    headers = headers if headers is not None else {}

    provider_config = ProviderConfigManager.get_provider_audio_transcription_config(
        model=model, provider=litellm.LlmProviders(custom_llm_provider)
    )
    if provider_config is None:
        raise ValueError(
            f"No provider config found for model: {model} and provider: {custom_llm_provider}"
        )

    # Let the provider config inject auth headers / validate credentials.
    headers = provider_config.validate_environment(
        api_key=api_key,
        headers=headers,
        model=model,
        messages=[],
        optional_params=optional_params,
    )

    # Only reuse a sync HTTPHandler; an AsyncHTTPHandler (or None) gets
    # replaced with a fresh synchronous client for this blocking call.
    if client is None or not isinstance(client, HTTPHandler):
        client = _get_httpx_client()

    complete_url = provider_config.get_complete_url(
        api_base=api_base,
        model=model,
        optional_params=optional_params,
    )

    # Convert whatever form the caller gave us into raw bytes.
    binary_data = self.handle_audio_file(audio_file)

    try:
        # POST the raw audio bytes as the request body.
        response = client.post(
            url=complete_url,
            headers=headers,
            content=binary_data,
            timeout=timeout,
        )
    except Exception as e:
        raise self._handle_error(e=e, provider_config=provider_config)

    # Deepgram is currently the only provider with a response transform;
    # other providers fall through and return the untouched model_response.
    if isinstance(provider_config, litellm.DeepgramAudioTranscriptionConfig):
        returned_response = provider_config.transform_audio_transcription_response(
            model=model,
            raw_response=response,
            model_response=model_response,
            logging_obj=logging_obj,
            request_data={},
            optional_params=optional_params,
            litellm_params={},
            api_key=api_key,
        )
        return returned_response
    return model_response
def _handle_error(
self, e: Exception, provider_config: Union[BaseConfig, BaseRerankConfig]
):