From a67cbf47f6a905e69f85ee1b27289bb48a25be2b Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 30 May 2024 14:28:28 -0700 Subject: [PATCH 1/3] feat(main.py): support openai tts endpoint Closes https://github.com/BerriAI/litellm/issues/3094 --- litellm/__init__.py | 3 +- litellm/llms/openai.py | 89 +++++++++++++++++++ litellm/main.py | 134 +++++++++++++++++++++++++++++ litellm/tests/test_audio_speech.py | 91 ++++++++++++++++++++ litellm/types/llms/openai.py | 8 +- 5 files changed, 322 insertions(+), 3 deletions(-) create mode 100644 litellm/tests/test_audio_speech.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 026eeb833..9fa801318 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -227,7 +227,7 @@ default_team_settings: Optional[List] = None max_user_budget: Optional[float] = None max_end_user_budget: Optional[float] = None #### RELIABILITY #### -request_timeout: Optional[float] = 6000 +request_timeout: float = 6000 num_retries: Optional[int] = None # per model endpoint default_fallbacks: Optional[List] = None fallbacks: Optional[List] = None @@ -304,6 +304,7 @@ api_base = None headers = None api_version = None organization = None +project = None config_path = None ####### COMPLETION MODELS ################### open_ai_chat_completion_models: List = [] diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 84d9c773f..4f1649651 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -26,6 +26,7 @@ import litellm from .prompt_templates.factory import prompt_factory, custom_prompt from openai import OpenAI, AsyncOpenAI from ..types.llms.openai import * +import openai class OpenAIError(Exception): @@ -1180,6 +1181,94 @@ class OpenAIChatCompletion(BaseLLM): ) raise e + def audio_speech( + self, + model: str, + input: str, + voice: str, + optional_params: dict, + api_key: Optional[str], + api_base: Optional[str], + organization: Optional[str], + project: Optional[str], + max_retries: int, + timeout: Union[float, httpx.Timeout], + aspeech: Optional[bool] = None, + client=None, + ) -> ResponseContextManager[StreamedBinaryAPIResponse]: + + if aspeech is not None and aspeech == True: + return self.async_audio_speech( + model=model, + input=input, + voice=voice, + optional_params=optional_params, + api_key=api_key, + api_base=api_base, + organization=organization, + project=project, + max_retries=max_retries, + timeout=timeout, + client=client, + ) # type: ignore + + if client is None: + openai_client = OpenAI( + api_key=api_key, + base_url=api_base, + organization=organization, + project=project, + http_client=litellm.client_session, + timeout=timeout, + max_retries=max_retries, + ) + else: + openai_client = client + + response = openai_client.audio.speech.with_streaming_response.create( + model="tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + **optional_params, + ) + return response + + def async_audio_speech( + self, + model: str, + input: str, + voice: str, + optional_params: dict, + api_key: Optional[str], + api_base: Optional[str], + organization: Optional[str], + project: Optional[str], + max_retries: int, + timeout: Union[float, httpx.Timeout], + client=None, + ) -> AsyncResponseContextManager[AsyncStreamedBinaryAPIResponse]: + + if client is None: + openai_client = AsyncOpenAI( + api_key=api_key, + base_url=api_base, + organization=organization, + project=project, + http_client=litellm.aclient_session, + timeout=timeout, + max_retries=max_retries, + ) + else: + openai_client = client + + response = 
openai_client.audio.speech.with_streaming_response.create( + model="tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + **optional_params, + ) + return response + async def ahealth_check( self, model: Optional[str], diff --git a/litellm/main.py b/litellm/main.py index d50694697..458e9bd8c 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -91,6 +91,12 @@ import tiktoken from concurrent.futures import ThreadPoolExecutor from typing import Callable, List, Optional, Dict, Union, Mapping from .caching import enable_cache, disable_cache, update_cache +from .types.llms.openai import ( + StreamedBinaryAPIResponse, + ResponseContextManager, + AsyncResponseContextManager, + AsyncStreamedBinaryAPIResponse, +) encoding = tiktoken.get_encoding("cl100k_base") from litellm.utils import ( @@ -4163,6 +4169,134 @@ def transcription( return response +def aspeech( + *args, **kwargs +) -> AsyncResponseContextManager[AsyncStreamedBinaryAPIResponse]: + """ + Calls openai tts endpoints. + """ + loop = asyncio.get_event_loop() + model = args[0] if len(args) > 0 else kwargs["model"] + ### PASS ARGS TO Image Generation ### + kwargs["aspeech"] = True + custom_llm_provider = kwargs.get("custom_llm_provider", None) + try: + # # Use a partial function to pass your keyword arguments + # func = partial(speech, *args, **kwargs) + + # # Add the context to the function + # ctx = contextvars.copy_context() + # func_with_context = partial(ctx.run, func) + + # _, custom_llm_provider, _, _ = get_llm_provider( + # model=model, api_base=kwargs.get("api_base", None) + # ) + + # # Await normally + # init_response = await loop.run_in_executor(None, func_with_context) + # if asyncio.iscoroutine(init_response): + # response = await init_response + # else: + # # Call the synchronous function using run_in_executor + # response = await loop.run_in_executor(None, func_with_context) + return speech(*args, **kwargs) # type: ignore + except Exception as e: + custom_llm_provider = custom_llm_provider or "openai" + raise exception_type( + model=model, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=args, + extra_kwargs=kwargs, + ) + + +def speech( + model: str, + input: str, + voice: str, + optional_params: dict, + api_key: Optional[str], + api_base: Optional[str], + organization: Optional[str], + project: Optional[str], + max_retries: int, + timeout: Optional[Union[float, httpx.Timeout]] = None, + response_format: Optional[str] = None, + speed: Optional[int] = None, + client=None, + headers: Optional[dict] = None, + custom_llm_provider: Optional[str] = None, + aspeech: Optional[bool] = None, +) -> ResponseContextManager[StreamedBinaryAPIResponse]: + + model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + + optional_params = {} + if response_format is not None: + optional_params["response_format"] = response_format + if speed is not None: + optional_params["speed"] = speed + + if timeout is None: + timeout = litellm.request_timeout + + response: Optional[ResponseContextManager[StreamedBinaryAPIResponse]] = None + if custom_llm_provider == "openai": + api_base = ( + api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there + or litellm.api_base + or get_secret("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) # type: ignore + # set API KEY + api_key = ( + api_key + or litellm.api_key # for 
deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there + or litellm.openai_key + or get_secret("OPENAI_API_KEY") + ) # type: ignore + + organization = ( + organization + or litellm.organization + or get_secret("OPENAI_ORGANIZATION") + or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105 + ) # type: ignore + + project = ( + project + or litellm.project + or get_secret("OPENAI_PROJECT") + or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105 + ) # type: ignore + + headers = headers or litellm.headers + + response = openai_chat_completions.audio_speech( + model=model, + input=input, + voice=voice, + optional_params=optional_params, + api_key=api_key, + api_base=api_base, + organization=organization, + project=project, + max_retries=max_retries, + timeout=timeout, + client=client, # pass AsyncOpenAI, OpenAI client + aspeech=aspeech, + ) + + if response is None: + raise Exception( + "Unable to map the custom llm provider={} to a known provider={}.".format( + custom_llm_provider, litellm.provider_list + ) + ) + return response + + ##### Health Endpoints ####################### diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py new file mode 100644 index 000000000..9cc6269de --- /dev/null +++ b/litellm/tests/test_audio_speech.py @@ -0,0 +1,91 @@ +# What is this? +## unit tests for openai tts endpoint + +import sys, os, asyncio, time, random, uuid +import traceback +from dotenv import load_dotenv + +load_dotenv() +import os + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import pytest +import litellm, openai +from pathlib import Path + + +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_audio_speech_openai(sync_mode): + + speech_file_path = Path(__file__).parent / "speech.mp3" + openai_chat_completions = litellm.OpenAIChatCompletion() + if sync_mode: + with openai_chat_completions.audio_speech( + model="tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + optional_params={}, + ) as response: + response.stream_to_file(speech_file_path) + else: + async with openai_chat_completions.async_audio_speech( + model="tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + optional_params={}, + ) as response: + speech = await response.parse() + + +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_audio_speech_litellm(sync_mode): + speech_file_path = Path(__file__).parent / "speech.mp3" + + if sync_mode: + with litellm.speech( + model="openai/tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + optional_params={}, + ) as response: + response.stream_to_file(speech_file_path) + else: + async with litellm.aspeech( + model="openai/tts-1", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + 
optional_params={}, + ) as response: + await response.stream_to_file(speech_file_path) diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index 57c199b61..2ea72a76e 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -8,7 +8,6 @@ from typing import ( ) from typing_extensions import override, Required, Dict from pydantic import BaseModel - from openai.types.beta.threads.message_content import MessageContent from openai.types.beta.threads.message import Message as OpenAIMessage from openai.types.beta.thread_create_params import ( @@ -21,7 +20,12 @@ from openai.pagination import SyncCursorPage from os import PathLike from openai.types import FileObject, Batch from openai._legacy_response import HttpxBinaryResponseContent - +from openai._response import ( + StreamedBinaryAPIResponse, + ResponseContextManager, + AsyncStreamedBinaryAPIResponse, + AsyncResponseContextManager, +) from typing import TypedDict, List, Optional, Tuple, Mapping, IO FileContent = Union[IO[bytes], bytes, PathLike] From 93166cdabf4661568c85081ec92958e719081782 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 30 May 2024 16:41:06 -0700 Subject: [PATCH 2/3] fix(openai.py): fix openai response for `/audio/speech` endpoint --- litellm/llms/openai.py | 23 ++-- litellm/main.py | 64 +++++------ litellm/proxy/_super_secret_config.yaml | 34 +----- litellm/proxy/proxy_server.py | 140 ++++++++++++++++++++++++ litellm/router.py | 78 +++++++++++++ litellm/tests/test_audio_speech.py | 93 ++++++++-------- litellm/types/llms/openai.py | 6 - 7 files changed, 311 insertions(+), 127 deletions(-) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 4f1649651..e68a50347 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1195,7 +1195,7 @@ class OpenAIChatCompletion(BaseLLM): timeout: Union[float, httpx.Timeout], aspeech: Optional[bool] = None, client=None, - ) -> ResponseContextManager[StreamedBinaryAPIResponse]: + ) -> HttpxBinaryResponseContent: if aspeech is not None and aspeech == True: return self.async_audio_speech( @@ -1225,15 +1225,15 @@ class OpenAIChatCompletion(BaseLLM): else: openai_client = client - response = openai_client.audio.speech.with_streaming_response.create( - model="tts-1", - voice="alloy", - input="the quick brown fox jumped over the lazy dogs", + response = openai_client.audio.speech.create( + model=model, + voice=voice, # type: ignore + input=input, **optional_params, ) return response - def async_audio_speech( + async def async_audio_speech( self, model: str, input: str, @@ -1246,7 +1246,7 @@ class OpenAIChatCompletion(BaseLLM): max_retries: int, timeout: Union[float, httpx.Timeout], client=None, - ) -> AsyncResponseContextManager[AsyncStreamedBinaryAPIResponse]: + ) -> HttpxBinaryResponseContent: if client is None: openai_client = AsyncOpenAI( @@ -1261,12 +1261,13 @@ class OpenAIChatCompletion(BaseLLM): else: openai_client = client - response = openai_client.audio.speech.with_streaming_response.create( - model="tts-1", - voice="alloy", - input="the quick brown fox jumped over the lazy dogs", + response = await openai_client.audio.speech.create( + model=model, + voice=voice, # type: ignore + input=input, **optional_params, ) + return response async def ahealth_check( diff --git a/litellm/main.py b/litellm/main.py index 458e9bd8c..2a07ae3c8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -91,12 +91,7 @@ import tiktoken from concurrent.futures import ThreadPoolExecutor from typing import Callable, List, Optional, Dict, 
Union, Mapping from .caching import enable_cache, disable_cache, update_cache -from .types.llms.openai import ( - StreamedBinaryAPIResponse, - ResponseContextManager, - AsyncResponseContextManager, - AsyncStreamedBinaryAPIResponse, -) +from .types.llms.openai import HttpxBinaryResponseContent encoding = tiktoken.get_encoding("cl100k_base") from litellm.utils import ( @@ -4169,9 +4164,7 @@ def transcription( return response -def aspeech( - *args, **kwargs -) -> AsyncResponseContextManager[AsyncStreamedBinaryAPIResponse]: +async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent: """ Calls openai tts endpoints. """ @@ -4181,25 +4174,25 @@ def aspeech( kwargs["aspeech"] = True custom_llm_provider = kwargs.get("custom_llm_provider", None) try: - # # Use a partial function to pass your keyword arguments - # func = partial(speech, *args, **kwargs) + # Use a partial function to pass your keyword arguments + func = partial(speech, *args, **kwargs) - # # Add the context to the function - # ctx = contextvars.copy_context() - # func_with_context = partial(ctx.run, func) + # Add the context to the function + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) - # _, custom_llm_provider, _, _ = get_llm_provider( - # model=model, api_base=kwargs.get("api_base", None) - # ) + _, custom_llm_provider, _, _ = get_llm_provider( + model=model, api_base=kwargs.get("api_base", None) + ) - # # Await normally - # init_response = await loop.run_in_executor(None, func_with_context) - # if asyncio.iscoroutine(init_response): - # response = await init_response - # else: - # # Call the synchronous function using run_in_executor - # response = await loop.run_in_executor(None, func_with_context) - return speech(*args, **kwargs) # type: ignore + # Await normally + init_response = await loop.run_in_executor(None, func_with_context) + if asyncio.iscoroutine(init_response): + response = await init_response + else: + # Call the synchronous function using run_in_executor + response = await loop.run_in_executor(None, func_with_context) + return response # type: ignore except Exception as e: custom_llm_provider = custom_llm_provider or "openai" raise exception_type( @@ -4215,12 +4208,12 @@ def speech( model: str, input: str, voice: str, - optional_params: dict, - api_key: Optional[str], - api_base: Optional[str], - organization: Optional[str], - project: Optional[str], - max_retries: int, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + organization: Optional[str] = None, + project: Optional[str] = None, + max_retries: Optional[int] = None, + metadata: Optional[dict] = None, timeout: Optional[Union[float, httpx.Timeout]] = None, response_format: Optional[str] = None, speed: Optional[int] = None, @@ -4228,7 +4221,8 @@ def speech( headers: Optional[dict] = None, custom_llm_provider: Optional[str] = None, aspeech: Optional[bool] = None, -) -> ResponseContextManager[StreamedBinaryAPIResponse]: + **kwargs, +) -> HttpxBinaryResponseContent: model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore @@ -4236,12 +4230,14 @@ def speech( if response_format is not None: optional_params["response_format"] = response_format if speed is not None: - optional_params["speed"] = speed + optional_params["speed"] = speed # type: ignore if timeout is None: timeout = litellm.request_timeout - response: Optional[ResponseContextManager[StreamedBinaryAPIResponse]] = None + if max_retries is None: + 
max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES + response: Optional[HttpxBinaryResponseContent] = None if custom_llm_provider == "openai": api_base = ( api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 95cf05e71..9c9982428 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -1,31 +1,3 @@ -general_settings: - alert_to_webhook_url: - budget_alerts: https://hooks.slack.com/services/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 - daily_reports: https://hooks.slack.com/services/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 - db_exceptions: https://hooks.slack.com/services/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 - llm_exceptions: https://hooks.slack.com/services/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 - llm_requests_hanging: https://hooks.slack.com/services/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 - llm_too_slow: https://hooks.slack.com/services/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 - outage_alerts: https://hooks.slack.com/services/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 - alert_types: - - llm_exceptions - - llm_too_slow - - llm_requests_hanging - - budget_alerts - - db_exceptions - - daily_reports - - spend_reports - - cooldown_deployment - - new_model_added - - outage_alerts - alerting: - - slack - database_connection_pool_limit: 100 - database_connection_timeout: 60 - health_check_interval: 300 - ui_access_mode: all -# litellm_settings: -# json_logs: true model_list: - litellm_params: api_base: http://0.0.0.0:8080 @@ -52,10 +24,8 @@ model_list: api_version: '2023-05-15' model: azure/chatgpt-v-2 model_name: gpt-3.5-turbo -- model_name: mistral +- model_name: tts litellm_params: - model: azure/mistral-large-latest - api_base: https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/ - api_key: zEJhgmw1FAKk0XzPWoLEg7WU1cXbWYYn + model: openai/tts-1 router_settings: enable_pre_call_checks: true diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 56aa1b35e..e2a3425f2 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -79,6 +79,9 @@ def generate_feedback_box(): import litellm +from litellm.types.llms.openai import ( + HttpxBinaryResponseContent, +) from litellm.proxy.utils import ( PrismaClient, DBClient, @@ -4875,6 +4878,143 @@ async def image_generation( ) +@router.post( + "/v1/audio/speech", + dependencies=[Depends(user_api_key_auth)], + tags=["audio"], +) +@router.post( + "/audio/speech", + dependencies=[Depends(user_api_key_auth)], + tags=["audio"], +) +async def audio_speech( + request: Request, + fastapi_response: Response, + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + """ + Same params as: + + https://platform.openai.com/docs/api-reference/audio/createSpeech + """ + global proxy_logging_obj + data: Dict = {} + try: + # Use orjson to parse JSON data, orjson speeds up requests significantly + body = await request.body() + data = orjson.loads(body) + + # Include original request and headers in the data + data["proxy_server_request"] = { # type: ignore + "url": str(request.url), + "method": request.method, + "headers": dict(request.headers), + "body": copy.copy(data), # use copy instead of deepcopy + } + + if data.get("user", None) is None and user_api_key_dict.user_id is not None: + data["user"] = 
user_api_key_dict.user_id + + if user_model: + data["model"] = user_model + + if "metadata" not in data: + data["metadata"] = {} + data["metadata"]["user_api_key"] = user_api_key_dict.api_key + data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata + _headers = dict(request.headers) + _headers.pop( + "authorization", None + ) # do not store the original `sk-..` api key in the db + data["metadata"]["headers"] = _headers + data["metadata"]["user_api_key_alias"] = getattr( + user_api_key_dict, "key_alias", None + ) + data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id + data["metadata"]["user_api_key_team_id"] = getattr( + user_api_key_dict, "team_id", None + ) + data["metadata"]["global_max_parallel_requests"] = general_settings.get( + "global_max_parallel_requests", None + ) + data["metadata"]["user_api_key_team_alias"] = getattr( + user_api_key_dict, "team_alias", None + ) + data["metadata"]["endpoint"] = str(request.url) + + ### TEAM-SPECIFIC PARAMS ### + if user_api_key_dict.team_id is not None: + team_config = await proxy_config.load_team_config( + team_id=user_api_key_dict.team_id + ) + if len(team_config) == 0: + pass + else: + team_id = team_config.pop("team_id", None) + data["metadata"]["team_id"] = team_id + data = { + **team_config, + **data, + } # add the team-specific configs to the completion call + + router_model_names = llm_router.model_names if llm_router is not None else [] + + ### CALL HOOKS ### - modify incoming data / reject request before calling the model + data = await proxy_logging_obj.pre_call_hook( + user_api_key_dict=user_api_key_dict, data=data, call_type="image_generation" + ) + + ## ROUTE TO CORRECT ENDPOINT ## + # skip router if user passed their key + if "api_key" in data: + response = await litellm.aspeech(**data) + elif ( + llm_router is not None and data["model"] in router_model_names + ): # model in router model list + response = await llm_router.aspeech(**data) + elif ( + llm_router is not None and data["model"] in llm_router.deployment_names + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.aspeech(**data, specific_deployment=True) + elif ( + llm_router is not None + and llm_router.model_group_alias is not None + and data["model"] in llm_router.model_group_alias + ): # model set in model_group_alias + response = await llm_router.aspeech( + **data + ) # ensure this goes the llm_router, router will do the correct alias mapping + elif ( + llm_router is not None + and data["model"] not in router_model_names + and llm_router.default_deployment is not None + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.aspeech(**data) + elif user_model is not None: # `litellm --model ` + response = await litellm.aspeech(**data) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "error": "audio_speech: Invalid model name passed in model=" + + data.get("model", "") + }, + ) + + # Printing each chunk size + async def generate(_response: HttpxBinaryResponseContent): + _generator = await _response.aiter_bytes(chunk_size=1024) + async for chunk in _generator: + yield chunk + + return StreamingResponse(generate(response), media_type="audio/mpeg") + + except Exception as e: + traceback.print_exc() + raise e + + @router.post( "/v1/audio/transcriptions", dependencies=[Depends(user_api_key_auth)], diff --git a/litellm/router.py b/litellm/router.py index d7535a83a..4474864c3 100644 --- 
a/litellm/router.py +++ b/litellm/router.py @@ -1202,6 +1202,84 @@ class Router: self.fail_calls[model_name] += 1 raise e + async def aspeech(self, model: str, input: str, voice: str, **kwargs): + """ + Example Usage: + + ``` + from litellm import Router + client = Router(model_list = [ + { + "model_name": "tts", + "litellm_params": { + "model": "tts-1", + }, + }, + ]) + + async with client.aspeech( + model="tts", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + optional_params={}, + ) as response: + response.stream_to_file(speech_file_path) + + ``` + """ + try: + kwargs["input"] = input + kwargs["voice"] = voice + + deployment = await self.async_get_available_deployment( + model=model, + messages=[{"role": "user", "content": "prompt"}], + specific_deployment=kwargs.pop("specific_deployment", None), + ) + kwargs.setdefault("metadata", {}).update( + { + "deployment": deployment["litellm_params"]["model"], + "model_info": deployment.get("model_info", {}), + } + ) + kwargs["model_info"] = deployment.get("model_info", {}) + data = deployment["litellm_params"].copy() + model_name = data["model"] + for k, v in self.default_litellm_params.items(): + if ( + k not in kwargs + ): # prioritize model-specific params > default router params + kwargs[k] = v + elif k == "metadata": + kwargs[k].update(v) + + potential_model_client = self._get_client( + deployment=deployment, kwargs=kwargs, client_type="async" + ) + # check if provided keys == client keys # + dynamic_api_key = kwargs.get("api_key", None) + if ( + dynamic_api_key is not None + and potential_model_client is not None + and dynamic_api_key != potential_model_client.api_key + ): + model_client = None + else: + model_client = potential_model_client + + response = await litellm.aspeech(**data, **kwargs) + + return response + except Exception as e: + raise e + async def amoderation(self, model: str, input: str, **kwargs): try: kwargs["model"] = model diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py index 9cc6269de..dde196d9c 100644 --- a/litellm/tests/test_audio_speech.py +++ b/litellm/tests/test_audio_speech.py @@ -16,51 +16,13 @@ import litellm, openai from pathlib import Path -@pytest.mark.parametrize("sync_mode", [True, False]) -@pytest.mark.asyncio -async def test_audio_speech_openai(sync_mode): - - speech_file_path = Path(__file__).parent / "speech.mp3" - openai_chat_completions = litellm.OpenAIChatCompletion() - if sync_mode: - with openai_chat_completions.audio_speech( - model="tts-1", - voice="alloy", - input="the quick brown fox jumped over the lazy dogs", - api_base=None, - api_key=None, - organization=None, - project=None, - max_retries=1, - timeout=600, - client=None, - optional_params={}, - ) as response: - response.stream_to_file(speech_file_path) - else: - async with openai_chat_completions.async_audio_speech( - model="tts-1", - voice="alloy", - input="the quick brown fox jumped over the lazy dogs", - api_base=None, - api_key=None, - organization=None, - project=None, - max_retries=1, - timeout=600, - client=None, - optional_params={}, - ) as response: - speech = await response.parse() - - @pytest.mark.parametrize("sync_mode", [True, False]) @pytest.mark.asyncio async def test_audio_speech_litellm(sync_mode): speech_file_path = Path(__file__).parent / "speech.mp3" if sync_mode: - with litellm.speech( + response = litellm.speech( model="openai/tts-1", 
voice="alloy", input="the quick brown fox jumped over the lazy dogs", @@ -72,10 +34,13 @@ async def test_audio_speech_litellm(sync_mode): timeout=600, client=None, optional_params={}, - ) as response: - response.stream_to_file(speech_file_path) + ) + + from litellm.llms.openai import HttpxBinaryResponseContent + + assert isinstance(response, HttpxBinaryResponseContent) else: - async with litellm.aspeech( + response = await litellm.aspeech( model="openai/tts-1", voice="alloy", input="the quick brown fox jumped over the lazy dogs", @@ -87,5 +52,45 @@ async def test_audio_speech_litellm(sync_mode): timeout=600, client=None, optional_params={}, - ) as response: - await response.stream_to_file(speech_file_path) + ) + + from litellm.llms.openai import HttpxBinaryResponseContent + + assert isinstance(response, HttpxBinaryResponseContent) + + +@pytest.mark.parametrize("mode", ["iterator"]) # "file", +@pytest.mark.asyncio +async def test_audio_speech_router(mode): + speech_file_path = Path(__file__).parent / "speech.mp3" + + from litellm import Router + + client = Router( + model_list=[ + { + "model_name": "tts", + "litellm_params": { + "model": "openai/tts-1", + }, + }, + ] + ) + + response = await client.aspeech( + model="tts", + voice="alloy", + input="the quick brown fox jumped over the lazy dogs", + api_base=None, + api_key=None, + organization=None, + project=None, + max_retries=1, + timeout=600, + client=None, + optional_params={}, + ) + + from litellm.llms.openai import HttpxBinaryResponseContent + + assert isinstance(response, HttpxBinaryResponseContent) diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index 2ea72a76e..964ac9c9b 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -20,12 +20,6 @@ from openai.pagination import SyncCursorPage from os import PathLike from openai.types import FileObject, Batch from openai._legacy_response import HttpxBinaryResponseContent -from openai._response import ( - StreamedBinaryAPIResponse, - ResponseContextManager, - AsyncStreamedBinaryAPIResponse, - AsyncResponseContextManager, -) from typing import TypedDict, List, Optional, Tuple, Mapping, IO FileContent = Union[IO[bytes], bytes, PathLike] From d65b7fe01b1737eb75beb4868e93fa1695ce8c09 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 30 May 2024 16:57:11 -0700 Subject: [PATCH 3/3] fix(main.py): add logging to audio_transcription calls --- litellm/main.py | 2 ++ .../out/{404.html => 404/index.html} | 0 .../{model_hub.html => model_hub/index.html} | 0 litellm/proxy/proxy_server.py | 28 ++++++++++++++++++- litellm/utils.py | 8 ++++++ 5 files changed, 37 insertions(+), 1 deletion(-) rename litellm/proxy/_experimental/out/{404.html => 404/index.html} (100%) rename litellm/proxy/_experimental/out/{model_hub.html => model_hub/index.html} (100%) diff --git a/litellm/main.py b/litellm/main.py index 2a07ae3c8..525a39d68 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -4164,6 +4164,7 @@ def transcription( return response +@client async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent: """ Calls openai tts endpoints. 
@@ -4204,6 +4205,7 @@ async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent: ) +@client def speech( model: str, input: str, diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404/index.html similarity index 100% rename from litellm/proxy/_experimental/out/404.html rename to litellm/proxy/_experimental/out/404/index.html diff --git a/litellm/proxy/_experimental/out/model_hub.html b/litellm/proxy/_experimental/out/model_hub/index.html similarity index 100% rename from litellm/proxy/_experimental/out/model_hub.html rename to litellm/proxy/_experimental/out/model_hub/index.html diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index e2a3425f2..2bd08fb89 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -5002,13 +5002,39 @@ async def audio_speech( }, ) + ### ALERTING ### + data["litellm_status"] = "success" # used for alerting + + ### RESPONSE HEADERS ### + hidden_params = getattr(response, "_hidden_params", {}) or {} + model_id = hidden_params.get("model_id", None) or "" + cache_key = hidden_params.get("cache_key", None) or "" + api_base = hidden_params.get("api_base", None) or "" + # Printing each chunk size async def generate(_response: HttpxBinaryResponseContent): _generator = await _response.aiter_bytes(chunk_size=1024) async for chunk in _generator: yield chunk - return StreamingResponse(generate(response), media_type="audio/mpeg") + custom_headers = get_custom_headers( + user_api_key_dict=user_api_key_dict, + model_id=model_id, + cache_key=cache_key, + api_base=api_base, + version=version, + model_region=getattr(user_api_key_dict, "allowed_model_region", ""), + fastest_response_batch_completion=None, + ) + + selected_data_generator = select_data_generator( + response=response, + user_api_key_dict=user_api_key_dict, + request_data=data, + ) + return StreamingResponse( + generate(response), media_type="audio/mpeg", headers=custom_headers + ) except Exception as e: traceback.print_exc() diff --git a/litellm/utils.py b/litellm/utils.py index 95d9160ef..b48ab9b15 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1136,6 +1136,8 @@ class CallTypes(Enum): amoderation = "amoderation" atranscription = "atranscription" transcription = "transcription" + aspeech = "aspeech" + speech = "speech" # Logging function -> log the exact model details + what's being sent | Non-BlockingP @@ -3005,6 +3007,10 @@ def function_setup( ): _file_name: BinaryIO = args[1] if len(args) > 1 else kwargs["file"] messages = "audio_file" + elif ( + call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value + ): + messages = kwargs.get("input", "speech") stream = True if "stream" in kwargs and kwargs["stream"] == True else False logging_obj = Logging( model=model, @@ -3346,6 +3352,8 @@ def client(original_function): return result elif "atranscription" in kwargs and kwargs["atranscription"] == True: return result + elif "aspeech" in kwargs and kwargs["aspeech"] == True: + return result ### POST-CALL RULES ### post_call_processing(original_response=result, model=model or None)
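A minimal end-to-end sketch of the `litellm.speech` / `litellm.aspeech` surface added in this series, pieced together from `litellm/tests/test_audio_speech.py` and the router docstring above; it assumes `OPENAI_API_KEY` is exported and the `speech.mp3` output path is purely illustrative:

```python
# Sketch of the new TTS entry points (PATCH 1-2/3); assumes OPENAI_API_KEY is set.
import asyncio
from pathlib import Path

import litellm
from litellm.llms.openai import HttpxBinaryResponseContent

speech_file_path = Path("speech.mp3")  # illustrative output path

# Sync: litellm.speech() returns the OpenAI SDK's raw binary response object
response = litellm.speech(
    model="openai/tts-1",
    voice="alloy",
    input="the quick brown fox jumped over the lazy dogs",
)
response.stream_to_file(speech_file_path)


# Async: litellm.aspeech() runs the same call in an executor and returns the same type
async def main() -> None:
    aresponse = await litellm.aspeech(
        model="openai/tts-1",
        voice="alloy",
        input="the quick brown fox jumped over the lazy dogs",
    )
    assert isinstance(aresponse, HttpxBinaryResponseContent)


asyncio.run(main())
```

After PATCH 2/3 both entry points return `HttpxBinaryResponseContent` rather than a streaming context manager, which is what lets the proxy re-stream the body itself via `aiter_bytes` in the `/audio/speech` route.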
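For the proxy route added in PATCH 2/3, the OpenAI SDK can be pointed at the litellm proxy directly; the URL, port, and `sk-1234` key below are illustrative assumptions, and `tts` is the `model_name` alias added to `_super_secret_config.yaml` in this series:

```python
# Sketch: calling the new /audio/speech proxy route (also mounted at /v1/audio/speech)
# with the OpenAI SDK. base_url, port, and api_key are illustrative; start the proxy
# with the config from this series so the "tts" alias maps to openai/tts-1.
from pathlib import Path

from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

response = client.audio.speech.create(
    model="tts",
    voice="alloy",
    input="the quick brown fox jumped over the lazy dogs",
)
response.stream_to_file(Path("proxy_speech.mp3"))
```

The route streams the bytes back as `audio/mpeg`, so the SDK-side response can be written to disk the same way as the direct `litellm.speech` call.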