docs on using vertex tts

2025-04-26 03:04:13 +00:00 · 2024-08-23 17:57:49 -07:00 · 2024-08-23 17:57:49 -07:00 · 8fada93fff
commit 8fada93fff
parent 225ff8432d
4 changed files with 36 additions and 25 deletions
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1812,9 +1812,9 @@ response.stream_to_file(speech_file_path)
 1. Add model to config.yaml
 ```yaml
 model_list:
-  - model_name: multimodalembedding@001
+  - model_name: vertex-tts
    litellm_params:
-      model: vertex_ai/multimodalembedding@001
+      model: vertex_ai/ # Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
      vertex_project: "adroit-crow-413218"
      vertex_location: "us-central1"
      vertex_credentials: adroit-crow-413218-a956eef1a2a8.json 
@@ -1837,23 +1837,14 @@ import openai

 client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

-# # request sent to model set on litellm proxy, `litellm --model`
-response = client.embeddings.create(
-    model="multimodalembedding@001", 
-    input = None,
-    extra_body = {
-        "instances": [
-        {
-            "image": {
-                "bytesBase64Encoded": "base64"
-            },
-            "text": "this is a unicorn",
-        },
-    ],
-    }
+# see supported values for "voice" on vertex here: 
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input="the quick brown fox jumped over the lazy dogs",
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}
 )
-
-print(response)
+print("response from proxy", response)
 ```

 </TabItem>
--- a/litellm/llms/text_to_speech/vertex_ai.py
+++ b/litellm/llms/text_to_speech/vertex_ai.py
@@ -54,7 +54,7 @@ class VertexTextToSpeechAPI(VertexLLM):
        timeout: Union[float, httpx.Timeout],
        model: str,
        input: str,
-        voice: Optional[str] = None,
+        voice: Optional[dict] = None,
        _is_async: Optional[bool] = False,
        optional_params: Optional[dict] = None,
        **kwargs,
@@ -87,7 +87,9 @@ class VertexTextToSpeechAPI(VertexLLM):
        vertex_input = VertexInput(text=input)
        # required param
        optional_params = optional_params or {}
-        if "voice" in optional_params:
+        if voice is not None:
+            vertex_voice = VertexVoice(**voice)
+        elif "voice" in optional_params:
            vertex_voice = VertexVoice(**optional_params["voice"])
        else:
            # use defaults to not fail the request
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -4699,7 +4699,7 @@ async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent:
 def speech(
    model: str,
    input: str,
-    voice: Optional[str] = None,
+    voice: Optional[Union[str, dict]] = None,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    api_version: Optional[str] = None,
@@ -4735,9 +4735,9 @@ def speech(
    logging_obj = kwargs.get("litellm_logging_obj", None)
    response: Optional[HttpxBinaryResponseContent] = None
    if custom_llm_provider == "openai":
-        if voice is None:
+        if voice is None or not (isinstance(voice, str)):
            raise litellm.BadRequestError(
-                message="'voice' is required for OpenAI TTS",
+                message="'voice' is required to be passed as a string for OpenAI TTS",
                model=model,
                llm_provider=custom_llm_provider,
            )
@@ -4787,9 +4787,9 @@ def speech(
        )
    elif custom_llm_provider == "azure":
        # azure configs
-        if voice is None:
+        if voice is None or not (isinstance(voice, str)):
            raise litellm.BadRequestError(
-                message="'voice' is required for Azure TTS",
+                message="'voice' is required to be passed as a string for Azure TTS",
                model=model,
                llm_provider=custom_llm_provider,
            )
@@ -4849,6 +4849,13 @@ def speech(
        vertex_credentials = generic_optional_params.vertex_credentials or get_secret(
            "VERTEXAI_CREDENTIALS"
        )
+
+        if voice is not None and not isinstance(voice, dict):
+            raise litellm.BadRequestError(
+                message=f"'voice' is required to be passed as a dict for Vertex AI TTS, passed in voice={voice}",
+                model=model,
+                llm_provider=custom_llm_provider,
+            )
        response = vertex_text_to_speech.audio_speech(
            _is_async=aspeech,
            vertex_credentials=vertex_credentials,
--- a/litellm/proxy/tests/test_openai_tts_request.py
+++ b/litellm/proxy/tests/test_openai_tts_request.py
@@ -0,0 +1,11 @@
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# # request sent to model set on litellm proxy, `litellm --model`
+response = client.audio.speech.create(
+    model="vertex-tts",
+    input="the quick brown fox jumped over the lazy dogs",
+    voice={"languageCode": "en-US", "name": "en-US-Studio-O"},  # type: ignore
+)
+print("response from proxy", response)  # noqa