use correct type hints for audio transcriptions

Ishaan Jaff 2024-09-05 09:12:27 -07:00
parent a9be7dd79b
commit 81ee1653af
13 changed files with 420 additions and 430 deletions
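The gist of the change: litellm's transcription entry points were annotated with `typing.BinaryIO`, which only matches open binary file handles, while the OpenAI SDK (and litellm's pass-through) also accepts raw bytes, paths, and (filename, content) tuples. The diff swaps the hint for the SDK's own `FileTypes` alias. A rough sketch of the difference (the two function names below are illustrative, not part of the diff):

from typing import BinaryIO

from openai.types.audio.transcription_create_params import FileTypes

def old_hint(file: BinaryIO) -> None: ...
def new_hint(file: FileTypes) -> None: ...

new_hint(open("speech.mp3", "rb"))            # ok under both hints
new_hint(b"raw audio bytes")                  # ok: bytes are valid FileTypes
new_hint(("speech.mp3", b"raw audio bytes"))  # ok: (filename, content) tuple
old_hint(b"raw audio bytes")                  # type checker error: bytes is not BinaryIO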


@@ -20,8 +20,8 @@ import httpx
 
 import litellm
 from litellm import client
-from litellm.llms.azure import AzureBatchesAPI
-from litellm.llms.openai import OpenAIBatchesAPI
+from litellm.llms.AzureOpenAI.azure import AzureBatchesAPI
+from litellm.llms.OpenAI.openai import OpenAIBatchesAPI
 from litellm.secret_managers.main import get_secret
 from litellm.types.llms.openai import (
     Batch,


@@ -17,7 +17,7 @@ import time
 import traceback
 from datetime import timedelta
 from enum import Enum
-from typing import Any, BinaryIO, List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union
 
 from openai._models import BaseModel as OpenAIObject


@@ -1,9 +1,10 @@
 # What is this?
 ## Helper utilities
 import os
-from typing import BinaryIO, List, Literal, Optional, Tuple
+from typing import List, Literal, Optional, Tuple
 
 from litellm._logging import verbose_logger
+from litellm.types.utils import FileTypes
 
 
 def map_finish_reason(
@@ -88,18 +89,19 @@ def _get_parent_otel_span_from_kwargs(kwargs: Optional[dict] = None):
     return None
 
 
-def get_file_check_sum(_file: BinaryIO):
-    """
-    Helper to safely get file checksum - used as a cache key
-    """
-    try:
-        file_descriptor = _file.fileno()
-        file_stat = os.fstat(file_descriptor)
-        file_size = str(file_stat.st_size)
-        file_checksum = _file.name + file_size
-        return file_checksum
-    except Exception as e:
-        verbose_logger.error(f"Error getting file_checksum: {(str(e))}")
-        file_checksum = _file.name
-        return file_checksum
-    return file_checksum
+def get_audio_file_name(file_obj: FileTypes) -> str:
+    """
+    Safely get the name of a file-like object or return its string representation.
+
+    Args:
+        file_obj (Any): A file-like object or any other object.
+
+    Returns:
+        str: The name of the file if available, otherwise a string representation of the object.
+    """
+    if hasattr(file_obj, "name"):
+        return getattr(file_obj, "name")
+    elif hasattr(file_obj, "__str__"):
+        return str(file_obj)
+    else:
+        return repr(file_obj)
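A quick sketch of how the new helper behaves (assumes a local speech.mp3; note that every Python object defines __str__, so the repr() branch is effectively unreachable):

from litellm.litellm_core_utils.core_helpers import get_audio_file_name

with open("speech.mp3", "rb") as f:
    print(get_audio_file_name(f))         # "speech.mp3" -- file objects expose .name

print(get_audio_file_name(b"raw audio"))  # "b'raw audio'" -- bytes fall through to str()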


@@ -0,0 +1,192 @@
+import uuid
+from typing import Optional, Union
+
+import httpx
+from openai import AsyncAzureOpenAI, AzureOpenAI
+from pydantic import BaseModel
+
+import litellm
+from litellm.litellm_core_utils.core_helpers import get_audio_file_name
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
+from litellm.types.utils import FileTypes
+from litellm.utils import TranscriptionResponse, convert_to_model_response_object
+
+from .azure import (
+    AzureChatCompletion,
+    get_azure_ad_token_from_oidc,
+    select_azure_base_url_or_endpoint,
+)
+
+
+class AzureAudioTranscription(AzureChatCompletion):
+    def audio_transcriptions(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        max_retries: int,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        api_version: Optional[str] = None,
+        client=None,
+        azure_ad_token: Optional[str] = None,
+        logging_obj=None,
+        atranscription: bool = False,
+    ):
+        data = {"model": model, "file": audio_file, **optional_params}
+
+        # init AzureOpenAI Client
+        azure_client_params = {
+            "api_version": api_version,
+            "azure_endpoint": api_base,
+            "azure_deployment": model,
+            "timeout": timeout,
+        }
+        azure_client_params = select_azure_base_url_or_endpoint(
+            azure_client_params=azure_client_params
+        )
+        if api_key is not None:
+            azure_client_params["api_key"] = api_key
+        elif azure_ad_token is not None:
+            if azure_ad_token.startswith("oidc/"):
+                azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
+            azure_client_params["azure_ad_token"] = azure_ad_token
+
+        if max_retries is not None:
+            azure_client_params["max_retries"] = max_retries
+
+        if atranscription is True:
+            return self.async_audio_transcriptions(
+                audio_file=audio_file,
+                data=data,
+                model_response=model_response,
+                timeout=timeout,
+                api_key=api_key,
+                api_base=api_base,
+                client=client,
+                azure_client_params=azure_client_params,
+                max_retries=max_retries,
+                logging_obj=logging_obj,
+            )
+        if client is None:
+            azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params)  # type: ignore
+        else:
+            azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=azure_client.api_key,
+            additional_args={
+                "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
+                "api_base": azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
+        response = azure_client.audio.transcriptions.create(
+            **data, timeout=timeout  # type: ignore
+        )
+
+        if isinstance(response, BaseModel):
+            stringified_response = response.model_dump()
+        else:
+            stringified_response = TranscriptionResponse(text=response).model_dump()
+
+        ## LOGGING
+        logging_obj.post_call(
+            input=get_audio_file_name(audio_file),
+            api_key=api_key,
+            additional_args={"complete_input_dict": data},
+            original_response=stringified_response,
+        )
+        hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        return final_response
+
+    async def async_audio_transcriptions(
+        self,
+        audio_file: FileTypes,
+        data: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        client=None,
+        azure_client_params=None,
+        max_retries=None,
+        logging_obj=None,
+    ):
+        response = None
+        try:
+            if client is None:
+                async_azure_client = AsyncAzureOpenAI(
+                    **azure_client_params,
+                    http_client=litellm.aclient_session,
+                )
+            else:
+                async_azure_client = client
+
+            ## LOGGING
+            logging_obj.pre_call(
+                input=f"audio_file_{uuid.uuid4()}",
+                api_key=async_azure_client.api_key,
+                additional_args={
+                    "headers": {
+                        "Authorization": f"Bearer {async_azure_client.api_key}"
+                    },
+                    "api_base": async_azure_client._base_url._uri_reference,
+                    "atranscription": True,
+                    "complete_input_dict": data,
+                },
+            )
+
+            raw_response = (
+                await async_azure_client.audio.transcriptions.with_raw_response.create(
+                    **data, timeout=timeout
+                )
+            )  # type: ignore
+
+            headers = dict(raw_response.headers)
+            response = raw_response.parse()
+
+            if isinstance(response, BaseModel):
+                stringified_response = response.model_dump()
+            else:
+                stringified_response = TranscriptionResponse(text=response).model_dump()
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=get_audio_file_name(audio_file),
+                api_key=api_key,
+                additional_args={
+                    "headers": {
+                        "Authorization": f"Bearer {async_azure_client.api_key}"
+                    },
+                    "api_base": async_azure_client._base_url._uri_reference,
+                    "atranscription": True,
+                    "complete_input_dict": data,
+                },
+                original_response=stringified_response,
+            )
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+            response = convert_to_model_response_object(
+                _response_headers=headers,
+                response_object=stringified_response,
+                model_response_object=model_response,
+                hidden_params=hidden_params,
+                response_type="audio_transcription",
+            )  # type: ignore
+            return response
+        except Exception as e:
+            ## LOGGING
+            logging_obj.post_call(
+                input=input,
+                api_key=api_key,
+                original_response=str(e),
+            )
+            raise e
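A minimal usage sketch of the Azure path above, as wired up in litellm.main further down (the deployment name, endpoint, key, and API version are placeholders):

import litellm

with open("speech.mp3", "rb") as audio:
    resp = litellm.transcription(
        model="azure/my-whisper-deployment",              # placeholder deployment
        file=audio,
        api_base="https://my-resource.openai.azure.com",  # placeholder endpoint
        api_key="...",                                    # placeholder key
        api_version="2024-02-15-preview",                 # placeholder version
    )
print(resp.text)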


@@ -4,17 +4,7 @@ import os
 import time
 import types
 import uuid
-from typing import (
-    Any,
-    BinaryIO,
-    Callable,
-    Coroutine,
-    Iterable,
-    List,
-    Literal,
-    Optional,
-    Union,
-)
+from typing import Any, Callable, Coroutine, Iterable, List, Literal, Optional, Union
 
 import httpx  # type: ignore
 import requests
@@ -27,6 +17,7 @@ from litellm import ImageResponse, OpenAIConfig
 from litellm.caching import DualCache
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.types.utils import FileTypes
 from litellm.utils import (
     Choices,
     CustomStreamWrapper,
@@ -39,7 +30,7 @@ from litellm.utils import (
     modify_url,
 )
 
-from ..types.llms.openai import (
+from ...types.llms.openai import (
     Assistant,
     AssistantEventHandler,
     AssistantStreamManager,
@@ -63,7 +54,7 @@ from ..types.llms.openai import (
     SyncCursorPage,
     Thread,
 )
-from .base import BaseLLM
+from ..base import BaseLLM
 
 azure_ad_cache = DualCache()
@@ -1570,178 +1561,6 @@ class AzureChatCompletion(BaseLLM):
         else:
             raise AzureOpenAIError(status_code=500, message=str(e))
 
-    def audio_transcriptions(
-        self,
-        model: str,
-        audio_file: BinaryIO,
-        optional_params: dict,
-        model_response: TranscriptionResponse,
-        timeout: float,
-        max_retries: int,
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        api_version: Optional[str] = None,
-        client=None,
-        azure_ad_token: Optional[str] = None,
-        logging_obj=None,
-        atranscription: bool = False,
-    ):
-        data = {"model": model, "file": audio_file, **optional_params}
-
-        # init AzureOpenAI Client
-        azure_client_params = {
-            "api_version": api_version,
-            "azure_endpoint": api_base,
-            "azure_deployment": model,
-            "timeout": timeout,
-        }
-        azure_client_params = select_azure_base_url_or_endpoint(
-            azure_client_params=azure_client_params
-        )
-        if api_key is not None:
-            azure_client_params["api_key"] = api_key
-        elif azure_ad_token is not None:
-            if azure_ad_token.startswith("oidc/"):
-                azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
-            azure_client_params["azure_ad_token"] = azure_ad_token
-
-        if max_retries is not None:
-            azure_client_params["max_retries"] = max_retries
-
-        if atranscription is True:
-            return self.async_audio_transcriptions(
-                audio_file=audio_file,
-                data=data,
-                model_response=model_response,
-                timeout=timeout,
-                api_key=api_key,
-                api_base=api_base,
-                client=client,
-                azure_client_params=azure_client_params,
-                max_retries=max_retries,
-                logging_obj=logging_obj,
-            )
-        if client is None:
-            azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params)  # type: ignore
-        else:
-            azure_client = client
-
-        ## LOGGING
-        logging_obj.pre_call(
-            input=f"audio_file_{uuid.uuid4()}",
-            api_key=azure_client.api_key,
-            additional_args={
-                "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
-                "api_base": azure_client._base_url._uri_reference,
-                "atranscription": True,
-                "complete_input_dict": data,
-            },
-        )
-
-        response = azure_client.audio.transcriptions.create(
-            **data, timeout=timeout  # type: ignore
-        )
-
-        if isinstance(response, BaseModel):
-            stringified_response = response.model_dump()
-        else:
-            stringified_response = TranscriptionResponse(text=response).model_dump()
-
-        ## LOGGING
-        logging_obj.post_call(
-            input=audio_file.name,
-            api_key=api_key,
-            additional_args={"complete_input_dict": data},
-            original_response=stringified_response,
-        )
-        hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
-        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
-        return final_response
-
-    async def async_audio_transcriptions(
-        self,
-        audio_file: BinaryIO,
-        data: dict,
-        model_response: TranscriptionResponse,
-        timeout: float,
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        client=None,
-        azure_client_params=None,
-        max_retries=None,
-        logging_obj=None,
-    ):
-        response = None
-        try:
-            if client is None:
-                async_azure_client = AsyncAzureOpenAI(
-                    **azure_client_params,
-                    http_client=litellm.aclient_session,
-                )
-            else:
-                async_azure_client = client
-
-            ## LOGGING
-            logging_obj.pre_call(
-                input=f"audio_file_{uuid.uuid4()}",
-                api_key=async_azure_client.api_key,
-                additional_args={
-                    "headers": {
-                        "Authorization": f"Bearer {async_azure_client.api_key}"
-                    },
-                    "api_base": async_azure_client._base_url._uri_reference,
-                    "atranscription": True,
-                    "complete_input_dict": data,
-                },
-            )
-
-            raw_response = (
-                await async_azure_client.audio.transcriptions.with_raw_response.create(
-                    **data, timeout=timeout
-                )
-            )  # type: ignore
-
-            headers = dict(raw_response.headers)
-            response = raw_response.parse()
-
-            if isinstance(response, BaseModel):
-                stringified_response = response.model_dump()
-            else:
-                stringified_response = TranscriptionResponse(text=response).model_dump()
-
-            ## LOGGING
-            logging_obj.post_call(
-                input=audio_file.name,
-                api_key=api_key,
-                additional_args={
-                    "headers": {
-                        "Authorization": f"Bearer {async_azure_client.api_key}"
-                    },
-                    "api_base": async_azure_client._base_url._uri_reference,
-                    "atranscription": True,
-                    "complete_input_dict": data,
-                },
-                original_response=stringified_response,
-            )
-            hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
-            response = convert_to_model_response_object(
-                _response_headers=headers,
-                response_object=stringified_response,
-                model_response_object=model_response,
-                hidden_params=hidden_params,
-                response_type="audio_transcription",
-            )  # type: ignore
-            return response
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
-            raise e
 
     def audio_speech(
         self,
         model: str,


@@ -0,0 +1,177 @@
+from typing import Optional, Union
+
+import httpx
+from openai import AsyncOpenAI, OpenAI
+from pydantic import BaseModel
+
+import litellm
+from litellm.litellm_core_utils.core_helpers import get_audio_file_name
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
+from litellm.types.utils import FileTypes
+from litellm.utils import TranscriptionResponse, convert_to_model_response_object
+
+from .openai import OpenAIChatCompletion
+
+
+class OpenAIAudioTranscription(OpenAIChatCompletion):
+    # Audio Transcriptions
+    async def make_openai_audio_transcriptions_request(
+        self,
+        openai_aclient: AsyncOpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
+        - call openai_aclient.audio.transcriptions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = (
+                    await openai_aclient.audio.transcriptions.with_raw_response.create(
+                        **data, timeout=timeout
+                    )
+                )  # type: ignore
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
+    def make_sync_openai_audio_transcriptions_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
+        - call openai_aclient.audio.transcriptions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = (
+                    openai_client.audio.transcriptions.with_raw_response.create(
+                        **data, timeout=timeout
+                    )
+                )  # type: ignore
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
+    def audio_transcriptions(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        max_retries: int,
+        api_key: Optional[str],
+        api_base: Optional[str],
+        client=None,
+        logging_obj=None,
+        atranscription: bool = False,
+    ):
+        data = {"model": model, "file": audio_file, **optional_params}
+        if atranscription is True:
+            return self.async_audio_transcriptions(
+                audio_file=audio_file,
+                data=data,
+                model_response=model_response,
+                timeout=timeout,
+                api_key=api_key,
+                api_base=api_base,
+                client=client,
+                max_retries=max_retries,
+                logging_obj=logging_obj,
+            )
+
+        openai_client = self._get_openai_client(
+            is_async=False,
+            api_key=api_key,
+            api_base=api_base,
+            timeout=timeout,
+            max_retries=max_retries,
+        )
+        _, response = self.make_sync_openai_audio_transcriptions_request(
+            openai_client=openai_client,
+            data=data,
+            timeout=timeout,
+        )
+
+        if isinstance(response, BaseModel):
+            stringified_response = response.model_dump()
+        else:
+            stringified_response = TranscriptionResponse(text=response).model_dump()
+
+        ## LOGGING
+        logging_obj.post_call(
+            input=get_audio_file_name(audio_file),
+            api_key=api_key,
+            additional_args={"complete_input_dict": data},
+            original_response=stringified_response,
+        )
+        hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        return final_response
+
+    async def async_audio_transcriptions(
+        self,
+        audio_file: FileTypes,
+        data: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        logging_obj: LiteLLMLoggingObj,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        client=None,
+        max_retries=None,
+    ):
+        try:
+            openai_aclient = self._get_openai_client(
+                is_async=True,
+                api_key=api_key,
+                api_base=api_base,
+                timeout=timeout,
+                max_retries=max_retries,
+                client=client,
+            )
+
+            headers, response = await self.make_openai_audio_transcriptions_request(
+                openai_aclient=openai_aclient,
+                data=data,
+                timeout=timeout,
+            )
+            logging_obj.model_call_details["response_headers"] = headers
+            if isinstance(response, BaseModel):
+                stringified_response = response.model_dump()
+            else:
+                stringified_response = TranscriptionResponse(text=response).model_dump()
+            ## LOGGING
+            logging_obj.post_call(
+                input=get_audio_file_name(audio_file),
+                api_key=api_key,
+                additional_args={"complete_input_dict": data},
+                original_response=stringified_response,
+            )
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        except Exception as e:
+            ## LOGGING
+            logging_obj.post_call(
+                input=input,
+                api_key=api_key,
+                original_response=str(e),
+            )
+            raise e
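A sketch of the async path these helpers serve (assumes OPENAI_API_KEY is set in the environment; setting `return_response_headers` opts into the `with_raw_response` branch above):

import asyncio

import litellm

litellm.return_response_headers = True

async def main():
    with open("speech.mp3", "rb") as audio:
        resp = await litellm.atranscription(model="whisper-1", file=audio)
    print(resp.text)

asyncio.run(main())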


@@ -4,16 +4,7 @@ import os
 import time
 import traceback
 import types
-from typing import (
-    Any,
-    BinaryIO,
-    Callable,
-    Coroutine,
-    Iterable,
-    Literal,
-    Optional,
-    Union,
-)
+from typing import Any, Callable, Coroutine, Iterable, Literal, Optional, Union
 
 import httpx
 import openai
@@ -33,14 +24,13 @@ from litellm.utils import (
     Message,
     ModelResponse,
     TextCompletionResponse,
-    TranscriptionResponse,
     Usage,
     convert_to_model_response_object,
 )
 
-from ..types.llms.openai import *
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ...types.llms.openai import *
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class OpenAIError(Exception):
@@ -1608,168 +1598,6 @@ class OpenAIChatCompletion(BaseLLM):
         else:
             raise OpenAIError(status_code=500, message=str(e))
 
-    # Audio Transcriptions
-    async def make_openai_audio_transcriptions_request(
-        self,
-        openai_aclient: AsyncOpenAI,
-        data: dict,
-        timeout: Union[float, httpx.Timeout],
-    ):
-        """
-        Helper to:
-        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
-        - call openai_aclient.audio.transcriptions.create by default
-        """
-        try:
-            if litellm.return_response_headers is True:
-                raw_response = (
-                    await openai_aclient.audio.transcriptions.with_raw_response.create(
-                        **data, timeout=timeout
-                    )
-                )  # type: ignore
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
-                return None, response
-        except Exception as e:
-            raise e
-
-    def make_sync_openai_audio_transcriptions_request(
-        self,
-        openai_client: OpenAI,
-        data: dict,
-        timeout: Union[float, httpx.Timeout],
-    ):
-        """
-        Helper to:
-        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
-        - call openai_aclient.audio.transcriptions.create by default
-        """
-        try:
-            if litellm.return_response_headers is True:
-                raw_response = (
-                    openai_client.audio.transcriptions.with_raw_response.create(
-                        **data, timeout=timeout
-                    )
-                )  # type: ignore
-                headers = dict(raw_response.headers)
-                response = raw_response.parse()
-                return headers, response
-            else:
-                response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
-                return None, response
-        except Exception as e:
-            raise e
-
-    def audio_transcriptions(
-        self,
-        model: str,
-        audio_file: BinaryIO,
-        optional_params: dict,
-        model_response: TranscriptionResponse,
-        timeout: float,
-        max_retries: int,
-        api_key: Optional[str],
-        api_base: Optional[str],
-        client=None,
-        logging_obj=None,
-        atranscription: bool = False,
-    ):
-        data = {"model": model, "file": audio_file, **optional_params}
-        if atranscription is True:
-            return self.async_audio_transcriptions(
-                audio_file=audio_file,
-                data=data,
-                model_response=model_response,
-                timeout=timeout,
-                api_key=api_key,
-                api_base=api_base,
-                client=client,
-                max_retries=max_retries,
-                logging_obj=logging_obj,
-            )
-
-        openai_client = self._get_openai_client(
-            is_async=False,
-            api_key=api_key,
-            api_base=api_base,
-            timeout=timeout,
-            max_retries=max_retries,
-        )
-        _, response = self.make_sync_openai_audio_transcriptions_request(
-            openai_client=openai_client,
-            data=data,
-            timeout=timeout,
-        )
-
-        if isinstance(response, BaseModel):
-            stringified_response = response.model_dump()
-        else:
-            stringified_response = TranscriptionResponse(text=response).model_dump()
-
-        ## LOGGING
-        logging_obj.post_call(
-            input=audio_file.name,
-            api_key=api_key,
-            additional_args={"complete_input_dict": data},
-            original_response=stringified_response,
-        )
-        hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
-        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
-        return final_response
-
-    async def async_audio_transcriptions(
-        self,
-        audio_file: BinaryIO,
-        data: dict,
-        model_response: TranscriptionResponse,
-        timeout: float,
-        logging_obj: LiteLLMLoggingObj,
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        client=None,
-        max_retries=None,
-    ):
-        try:
-            openai_aclient = self._get_openai_client(
-                is_async=True,
-                api_key=api_key,
-                api_base=api_base,
-                timeout=timeout,
-                max_retries=max_retries,
-                client=client,
-            )
-
-            headers, response = await self.make_openai_audio_transcriptions_request(
-                openai_aclient=openai_aclient,
-                data=data,
-                timeout=timeout,
-            )
-            logging_obj.model_call_details["response_headers"] = headers
-            if isinstance(response, BaseModel):
-                stringified_response = response.model_dump()
-            else:
-                stringified_response = TranscriptionResponse(text=response).model_dump()
-            ## LOGGING
-            logging_obj.post_call(
-                input=audio_file.name,
-                api_key=api_key,
-                additional_args={"complete_input_dict": data},
-                original_response=stringified_response,
-            )
-            hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
-            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
-        except Exception as e:
-            ## LOGGING
-            logging_obj.post_call(
-                input=input,
-                api_key=api_key,
-                original_response=str(e),
-            )
-            raise e
 
     def audio_speech(
         self,
         model: str,


@@ -1,7 +1,7 @@
 import json
 import types  # type: ignore
 import uuid
-from typing import Any, BinaryIO, Callable, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import httpx
 import requests
@@ -19,8 +19,8 @@ from litellm.utils import (
     convert_to_model_response_object,
 )
 
-from ..llms.openai import OpenAITextCompletion, OpenAITextCompletionConfig
 from .base import BaseLLM
+from .OpenAI.openai import OpenAITextCompletion, OpenAITextCompletionConfig
 from .prompt_templates.factory import custom_prompt, prompt_factory
 
 openai_text_completion_config = OpenAITextCompletionConfig()


@@ -22,18 +22,7 @@ import uuid
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from functools import partial
-from typing import (
-    Any,
-    BinaryIO,
-    Callable,
-    Dict,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Type,
-    Union,
-)
+from typing import Any, Callable, Dict, List, Literal, Mapping, Optional, Type, Union
 
 import dotenv
 import httpx
@@ -93,8 +82,9 @@ from .llms import (
 from .llms.AI21 import completion as ai21
 from .llms.anthropic.chat import AnthropicChatCompletion
 from .llms.anthropic.completion import AnthropicTextCompletion
-from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.azure_text import AzureTextCompletion
+from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
+from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.bedrock import image_generation as bedrock_image_generation  # type: ignore
 from .llms.bedrock.chat import BedrockConverseLLM, BedrockLLM
 from .llms.bedrock.embed.embedding import BedrockEmbedding
@@ -104,7 +94,8 @@ from .llms.cohere import embed as cohere_embed
 from .llms.custom_llm import CustomLLM, custom_chat_llm_router
 from .llms.databricks import DatabricksChatCompletion
 from .llms.huggingface_restapi import Huggingface
-from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
+from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription
+from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion
 from .llms.predibase import PredibaseChatCompletion
 from .llms.prompt_templates.factory import (
     custom_prompt,
@@ -146,6 +137,7 @@ from .types.llms.openai import HttpxBinaryResponseContent
 from .types.utils import (
     AdapterCompletionStreamWrapper,
     ChatCompletionMessageToolCall,
+    FileTypes,
     HiddenParams,
     all_litellm_params,
 )
@@ -169,11 +161,13 @@ from litellm.utils import (
 ####### ENVIRONMENT VARIABLES ###################
 openai_chat_completions = OpenAIChatCompletion()
 openai_text_completions = OpenAITextCompletion()
+openai_audio_transcriptions = OpenAIAudioTranscription()
 databricks_chat_completions = DatabricksChatCompletion()
 anthropic_chat_completions = AnthropicChatCompletion()
 anthropic_text_completions = AnthropicTextCompletion()
 azure_chat_completions = AzureChatCompletion()
 azure_text_completions = AzureTextCompletion()
+azure_audio_transcriptions = AzureAudioTranscription()
 huggingface = Huggingface()
 predibase_chat_completions = PredibaseChatCompletion()
 codestral_text_completions = CodestralTextCompletion()
@@ -4614,7 +4608,7 @@ async def atranscription(*args, **kwargs) -> TranscriptionResponse:
 @client
 def transcription(
     model: str,
-    file: BinaryIO,
+    file: FileTypes,
     ## OPTIONAL OPENAI PARAMS ##
     language: Optional[str] = None,
     prompt: Optional[str] = None,
@@ -4704,7 +4698,7 @@ def transcription(
             or get_secret("AZURE_API_KEY")
         )  # type: ignore
 
-        response = azure_chat_completions.audio_transcriptions(
+        response = azure_audio_transcriptions.audio_transcriptions(
             model=model,
             audio_file=file,
             optional_params=optional_params,
@@ -4738,7 +4732,7 @@ def transcription(
             or litellm.openai_key
             or get_secret("OPENAI_API_KEY")
         )  # type: ignore
-        response = openai_chat_completions.audio_transcriptions(
+        response = openai_audio_transcriptions.audio_transcriptions(
            model=model,
            audio_file=file,
            optional_params=optional_params,
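With `file` now typed as `FileTypes`, callers can pass content that was previously a type error even though it worked at runtime, e.g. a (filename, bytes) tuple (sketch; assumes a valid OPENAI_API_KEY):

import litellm

audio_bytes = open("speech.mp3", "rb").read()
resp = litellm.transcription(
    model="whisper-1",
    file=("speech.mp3", audio_bytes),  # (filename, content) tuple now type-checks
)
print(resp.text)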


@@ -24,18 +24,7 @@ import traceback
 import uuid
 from collections import defaultdict
 from datetime import datetime
-from typing import (
-    Any,
-    BinaryIO,
-    Dict,
-    Iterable,
-    List,
-    Literal,
-    Optional,
-    Tuple,
-    TypedDict,
-    Union,
-)
+from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union
 
 import httpx
 import openai
@@ -48,7 +37,7 @@ from litellm.assistants.main import AssistantDeleted
 from litellm.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
-from litellm.llms.azure import get_azure_ad_token_from_oidc
+from litellm.llms.AzureOpenAI.azure import get_azure_ad_token_from_oidc
 from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
 from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
 from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
@@ -1342,7 +1331,7 @@ class Router:
             self.fail_calls[model_name] += 1
             raise e
 
-    async def atranscription(self, file: BinaryIO, model: str, **kwargs):
+    async def atranscription(self, file: FileTypes, model: str, **kwargs):
         """
         Example Usage:
 
@@ -1386,7 +1375,7 @@
             )
             raise e
 
-    async def _atranscription(self, file: BinaryIO, model: str, **kwargs):
+    async def _atranscription(self, file: FileTypes, model: str, **kwargs):
         try:
             verbose_router_logger.debug(
                 f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"


@@ -1,16 +1,5 @@
 from os import PathLike
-from typing import (
-    IO,
-    Any,
-    BinaryIO,
-    Iterable,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union
 
 from openai._legacy_response import HttpxBinaryResponseContent
 from openai.lib.streaming._assistants import (


@@ -5,6 +5,7 @@ from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 from openai._models import BaseModel as OpenAIObject
+from openai.types.audio.transcription_create_params import FileTypes
 from openai.types.completion_usage import CompletionUsage
 from pydantic import ConfigDict, Field, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override
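For reference, the alias being imported roughly expands as follows (paraphrased from the openai SDK, where it is defined in openai._types; exact generics elided):

# FileContent = Union[IO[bytes], bytes, PathLike[str]]
# FileTypes = Union[
#     FileContent,                                              # bare content
#     Tuple[Optional[str], FileContent],                        # (filename, content)
#     Tuple[Optional[str], FileContent, Optional[str]],         # + content type
#     Tuple[Optional[str], FileContent, Optional[str], Mapping[str, str]],  # + headers
# ]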


@@ -58,7 +58,7 @@ import litellm.litellm_core_utils
 import litellm.litellm_core_utils.json_validation_rule
 from litellm.caching import DualCache
 from litellm.litellm_core_utils.core_helpers import (
-    get_file_check_sum,
+    get_audio_file_name,
     map_finish_reason,
 )
 from litellm.litellm_core_utils.exception_mapping_utils import get_error_message
@@ -86,6 +86,7 @@ from litellm.types.utils import (
     Delta,
     Embedding,
     EmbeddingResponse,
+    FileTypes,
     ImageResponse,
     Message,
     ModelInfo,
@@ -161,7 +162,6 @@ except Exception as e:
 from concurrent.futures import ThreadPoolExecutor
 from typing import (
     Any,
-    BinaryIO,
     Callable,
     Dict,
     Iterable,
@@ -566,14 +566,13 @@ def function_setup(
             call_type == CallTypes.atranscription.value
             or call_type == CallTypes.transcription.value
         ):
-            _file_name: BinaryIO = args[1] if len(args) > 1 else kwargs["file"]
-            file_checksum = get_file_check_sum(_file=_file_name)
-            file_name = _file_name.name
+            _file_obj: FileTypes = args[1] if len(args) > 1 else kwargs["file"]
+            file_checksum = get_audio_file_name(file_obj=_file_obj)
             if "metadata" in kwargs:
                 kwargs["metadata"]["file_checksum"] = file_checksum
             else:
                 kwargs["metadata"] = {"file_checksum": file_checksum}
-            messages = file_name
+            messages = _file_obj
         elif (
             call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
         ):
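One behavioral side effect worth noting in this hunk: the "file_checksum" metadata is no longer a name-plus-size value derived via os.fstat but simply whatever get_audio_file_name returns, so transcription cache keys now key on the file name alone. A quick sketch (assumes a local speech.mp3):

from litellm.litellm_core_utils.core_helpers import get_audio_file_name

with open("speech.mp3", "rb") as f:
    # previously: metadata["file_checksum"] == "speech.mp3" + str(byte_size)
    # now:        metadata["file_checksum"] == "speech.mp3"
    assert get_audio_file_name(f) == "speech.mp3"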