litellm-mirror/litellm/llms/openai/transcriptions/handler.py
Krish Dholakia c0845fec1f
Add OpenAI gpt-4o-transcribe support (#9517)
* refactor: introduce new transformation config for gpt-4o-transcribe models

* refactor: expose new transformation configs for audio transcription

* ci: fix config yml

* feat(openai/transcriptions): support provider config transformation on openai audio transcriptions

allows gpt-4o and whisper audio transformation to work as expected

* refactor: migrate fireworks ai + deepgram to new transform request pattern

* feat(openai/): working support for gpt-4o-audio-transcribe

* build(model_prices_and_context_window.json): add gpt-4o-transcribe to model cost map

* build(model_prices_and_context_window.json): specify what endpoints are supported for `/audio/transcriptions`

* fix(get_supported_openai_params.py): fix return

* refactor(deepgram/): migrate unit test to deepgram handler

* refactor: cleanup unused imports

* fix(get_supported_openai_params.py): fix linting error

* test: update test
2025-03-26 23:10:25 -07:00

222 lines
7.9 KiB
Python

from typing import Optional, Union
import httpx
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
import litellm
from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_name
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.types.utils import FileTypes
from litellm.utils import (
TranscriptionResponse,
convert_to_model_response_object,
extract_duration_from_srt_or_vtt,
)
from ..openai import OpenAIChatCompletion
class OpenAIAudioTranscription(OpenAIChatCompletion):
# Audio Transcriptions
async def make_openai_audio_transcriptions_request(
self,
openai_aclient: AsyncOpenAI,
data: dict,
timeout: Union[float, httpx.Timeout],
):
"""
Helper to:
- call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
- call openai_aclient.audio.transcriptions.create by default
"""
try:
raw_response = (
await openai_aclient.audio.transcriptions.with_raw_response.create(
**data, timeout=timeout
)
) # type: ignore
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
except Exception as e:
raise e
def make_sync_openai_audio_transcriptions_request(
self,
openai_client: OpenAI,
data: dict,
timeout: Union[float, httpx.Timeout],
):
"""
Helper to:
- call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
- call openai_aclient.audio.transcriptions.create by default
"""
try:
if litellm.return_response_headers is True:
raw_response = (
openai_client.audio.transcriptions.with_raw_response.create(
**data, timeout=timeout
)
) # type: ignore
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
else:
response = openai_client.audio.transcriptions.create(**data, timeout=timeout) # type: ignore
return None, response
except Exception as e:
raise e
def audio_transcriptions(
self,
model: str,
audio_file: FileTypes,
optional_params: dict,
litellm_params: dict,
model_response: TranscriptionResponse,
timeout: float,
max_retries: int,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str],
api_base: Optional[str],
client=None,
atranscription: bool = False,
provider_config: Optional[BaseAudioTranscriptionConfig] = None,
) -> TranscriptionResponse:
"""
Handle audio transcription request
"""
if provider_config is not None:
data = provider_config.transform_audio_transcription_request(
model=model,
audio_file=audio_file,
optional_params=optional_params,
litellm_params=litellm_params,
)
if isinstance(data, bytes):
raise ValueError("OpenAI transformation route requires a dict")
else:
data = {"model": model, "file": audio_file, **optional_params}
if atranscription is True:
return self.async_audio_transcriptions( # type: ignore
audio_file=audio_file,
data=data,
model_response=model_response,
timeout=timeout,
api_key=api_key,
api_base=api_base,
client=client,
max_retries=max_retries,
logging_obj=logging_obj,
)
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
client=client,
)
## LOGGING
logging_obj.pre_call(
input=None,
api_key=openai_client.api_key,
additional_args={
"api_base": openai_client._base_url._uri_reference,
"atranscription": True,
"complete_input_dict": data,
},
)
_, response = self.make_sync_openai_audio_transcriptions_request(
openai_client=openai_client,
data=data,
timeout=timeout,
)
if isinstance(response, BaseModel):
stringified_response = response.model_dump()
else:
stringified_response = TranscriptionResponse(text=response).model_dump()
## LOGGING
logging_obj.post_call(
input=get_audio_file_name(audio_file),
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
final_response: TranscriptionResponse = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription") # type: ignore
return final_response
async def async_audio_transcriptions(
self,
audio_file: FileTypes,
data: dict,
model_response: TranscriptionResponse,
timeout: float,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
max_retries=None,
):
try:
openai_aclient: AsyncOpenAI = self._get_openai_client( # type: ignore
is_async=True,
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
client=client,
)
## LOGGING
logging_obj.pre_call(
input=None,
api_key=openai_aclient.api_key,
additional_args={
"api_base": openai_aclient._base_url._uri_reference,
"atranscription": True,
"complete_input_dict": data,
},
)
headers, response = await self.make_openai_audio_transcriptions_request(
openai_aclient=openai_aclient,
data=data,
timeout=timeout,
)
logging_obj.model_call_details["response_headers"] = headers
if isinstance(response, BaseModel):
stringified_response = response.model_dump()
else:
duration = extract_duration_from_srt_or_vtt(response)
stringified_response = TranscriptionResponse(text=response).model_dump()
stringified_response["duration"] = duration
## LOGGING
logging_obj.post_call(
input=get_audio_file_name(audio_file),
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription") # type: ignore
except Exception as e:
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
original_response=str(e),
)
raise e