From 6d11b392f8786ccb287c543f5b02e655db93085a Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:17:53 -0700
Subject: [PATCH 1/7] add ssml input on vertex tts

---
 litellm/llms/text_to_speech/vertex_ai.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py
index b9fca5325..0b6b9deb8 100644
--- a/litellm/llms/text_to_speech/vertex_ai.py
+++ b/litellm/llms/text_to_speech/vertex_ai.py
@@ -19,7 +19,8 @@ from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gem
 
 
 class VertexInput(TypedDict, total=False):
-    text: str
+    text: Optional[str]
+    ssml: Optional[str]
 
 
 class VertexVoice(TypedDict, total=False):
@@ -86,10 +87,11 @@ class VertexTextToSpeechAPI(VertexLLM):
 
         ####### Build the request ################
         # API Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize
-        vertex_input = VertexInput(text=input)
+        kwargs = kwargs or {}
+        vertex_input = VertexInput(text=input, ssml=kwargs.get("ssml", None))
+        validate_vertex_input(vertex_input)
         # required param
         optional_params = optional_params or {}
-        kwargs = kwargs or {}
         if voice is not None:
             vertex_voice = VertexVoice(**voice)
         elif "voice" in kwargs:
@@ -203,3 +205,12 @@ class VertexTextToSpeechAPI(VertexLLM):
         # Initialize the HttpxBinaryResponseContent instance
         http_binary_response = HttpxBinaryResponseContent(response)
         return http_binary_response
+
+
+def validate_vertex_input(input_data: VertexInput) -> None:
+    if input_data.get("text", None) is None:
+        input_data.pop("text")
+    if "text" not in input_data and "ssml" not in input_data:
+        raise ValueError("Either 'text' or 'ssml' must be provided.")
+    if "text" in input_data and "ssml" in input_data:
+        raise ValueError("Only one of 'text' or 'ssml' should be provided, not both.")

From 52f098ef434f9ccd6ae631277aa8741febbaec7b Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:18:34 -0700
Subject: [PATCH 2/7] add vertex ssml test

---
 litellm/tests/test_audio_speech.py | 52 ++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py
index 5de996fa1..d9ed3fd6e 100644
--- a/litellm/tests/test_audio_speech.py
+++ b/litellm/tests/test_audio_speech.py
@@ -243,3 +243,55 @@ async def test_speech_litellm_vertex_async_with_voice():
             "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"},
             "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"},
         }
+
+
+@pytest.mark.asyncio
+async def test_speech_litellm_vertex_async_with_voice_ssml():
+    # Mock the response
+    mock_response = AsyncMock()
+
+    def return_val():
+        return {
+            "audioContent": "dGVzdCByZXNwb25zZQ==",
+        }
+
+    mock_response.json = return_val
+    mock_response.status_code = 200
+
+    # Set up the mock for asynchronous calls
+    with patch(
+        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
+        new_callable=AsyncMock,
+    ) as mock_async_post:
+        mock_async_post.return_value = mock_response
+        model = "vertex_ai/test"
+
+        response = await litellm.aspeech(
+            input=None,
+            model=model,
+            ssml="async hello what llm guardrail do you have",
+            voice={
+                "languageCode": "en-UK",
+                "name": "en-UK-Studio-O",
+            },
+            audioConfig={
+                "audioEncoding": "LINEAR22",
+                "speakingRate": "10",
+            },
+        )
+
+        # Assert asynchronous call
+        mock_async_post.assert_called_once()
+        _, kwargs = mock_async_post.call_args
+        print("call args", kwargs)
+
+        assert kwargs["url"] == "https://texttospeech.googleapis.com/v1/text:synthesize"
+
+        assert "x-goog-user-project" in kwargs["headers"]
+        assert kwargs["headers"]["Authorization"] is not None
+
+        assert kwargs["json"] == {
+            "input": {"ssml": "async hello what llm guardrail do you have"},
+            "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"},
+            "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"},
+        }

From 043919b27856c63b59c92c14f4e34fdc0d38c5f9 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:27:44 -0700
Subject: [PATCH 3/7] use ssml with litellm vertex

---
 docs/my-website/docs/providers/vertex.md | 57 +++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index 697aa0701..8912133b8 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1768,7 +1768,7 @@ LiteLLM supports calling [Vertex AI Text to Speech API](https://console.cloud.go
 
 
 
-Usage
+### Usage - Basic
 
 <Tabs>
 <TabItem value="sdk" label="SDK">
@@ -1841,6 +1841,61 @@ print("response from proxy", response)
 </Tabs>
 
 
+### Usage - `ssml` as input
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
+
+**Sync Usage**
+
+```python
+speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
+response = litellm.speech(
+    input=None,
+    model="vertex_ai/test",
+    ssml="async hello what llm guardrail do you have",
+    voice={
+        "languageCode": "en-UK",
+        "name": "en-UK-Studio-O",
+    },
+    audioConfig={
+        "audioEncoding": "LINEAR22",
+        "speakingRate": "10",
+    },
+)
+response.stream_to_file(speech_file_path)
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# see supported values for "voice" on vertex here: 
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input=None, # pass as None since OpenAI SDK requires this param
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
+    extra_body={
+        "ssml": "async hello what llm guardrail do you have"
+    }
+)
+print("response from proxy", response)
+```
+
+</TabItem>
+</Tabs>
+
+
+
 ## Extra
 
 ### Using `GOOGLE_APPLICATION_CREDENTIALS`

From 3a4c7f20784e81f11e87a6ceff34ef2c3b0b38c8 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:32:51 -0700
Subject: [PATCH 4/7] add ssml support on docs

---
 docs/my-website/docs/providers/vertex.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index 8912133b8..1e4f106a7 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1853,10 +1853,19 @@ Vertex AI does not support passing a `model` param - so passing `model=vertex_ai
 
 ```python
 speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
+
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the <break strength="medium" /> text-to-speech API.</p>
+</speak>
+"""
+
 response = litellm.speech(
     input=None,
     model="vertex_ai/test",
-    ssml="async hello what llm guardrail do you have",
+    ssml=ssml,
     voice={
         "languageCode": "en-UK",
         "name": "en-UK-Studio-O",
@@ -1878,6 +1887,13 @@ import openai
 
 client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
 
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the <break strength="medium" /> text-to-speech API.</p>
+</speak>
+"""
+
 # see supported values for "voice" on vertex here: 
 # https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
 response = client.audio.speech.create(
@@ -1885,7 +1901,7 @@ response = client.audio.speech.create(
     input=None, # pass as None since OpenAI SDK requires this param
     voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
     extra_body={
-        "ssml": "async hello what llm guardrail do you have"
+        "ssml": ssml
     }
 )
 print("response from proxy", response)

From 58506dbade63d1b9a940c04f1fc33022bce4d7ec Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:52:26 -0700
Subject: [PATCH 5/7] update validate_vertex_input

---
 litellm/llms/text_to_speech/vertex_ai.py | 38 +++++++++++++++++++-----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py
index 0b6b9deb8..39c5ddfd4 100644
--- a/litellm/llms/text_to_speech/vertex_ai.py
+++ b/litellm/llms/text_to_speech/vertex_ai.py
@@ -88,10 +88,12 @@ class VertexTextToSpeechAPI(VertexLLM):
         ####### Build the request ################
         # API Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize
         kwargs = kwargs or {}
-        vertex_input = VertexInput(text=input, ssml=kwargs.get("ssml", None))
-        validate_vertex_input(vertex_input)
-        # required param
         optional_params = optional_params or {}
+
+        vertex_input = VertexInput(text=input)
+        validate_vertex_input(vertex_input, kwargs, optional_params)
+
+        # required param
         if voice is not None:
             vertex_voice = VertexVoice(**voice)
         elif "voice" in kwargs:
@@ -207,10 +209,32 @@ class VertexTextToSpeechAPI(VertexLLM):
         return http_binary_response
 
 
-def validate_vertex_input(input_data: VertexInput) -> None:
-    if input_data.get("text", None) is None:
-        input_data.pop("text")
-    if "text" not in input_data and "ssml" not in input_data:
+def validate_vertex_input(
+    input_data: VertexInput, kwargs: dict, optional_params: dict
+) -> None:
+    # Remove None values
+    if input_data.get("text") is None:
+        input_data.pop("text", None)
+    if input_data.get("ssml") is None:
+        input_data.pop("ssml", None)
+
+    # Check if use_ssml is set
+    use_ssml = kwargs.get("use_ssml", optional_params.get("use_ssml", False))
+
+    if use_ssml:
+        if "text" in input_data:
+            input_data["ssml"] = input_data.pop("text")
+        elif "ssml" not in input_data:
+            raise ValueError("SSML input is required when use_ssml is True.")
+    else:
+        # LiteLLM will auto-detect if text is in ssml format
+        # check if "text" is an ssml - in this case we should pass it as ssml instead of text
+        if input_data:
+            _text = input_data.get("text", None) or ""
+            if "<speak>" in _text:
+                input_data["ssml"] = input_data.pop("text")
+
+    if not input_data:
         raise ValueError("Either 'text' or 'ssml' must be provided.")
     if "text" in input_data and "ssml" in input_data:
         raise ValueError("Only one of 'text' or 'ssml' should be provided, not both.")

From 5ea27bdea99e0ada8f15f9956e7b46701a30d162 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 12:54:23 -0700
Subject: [PATCH 6/7] simplify ssml usage

---
 litellm/tests/test_audio_speech.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py
index d9ed3fd6e..2c710d37e 100644
--- a/litellm/tests/test_audio_speech.py
+++ b/litellm/tests/test_audio_speech.py
@@ -258,6 +258,13 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
     mock_response.json = return_val
     mock_response.status_code = 200
 
+    ssml = """
+    <speak>
+        <p>Hello, world!</p>
+        <p>This is a test of the <break strength="medium" /> text-to-speech API.</p>
+    </speak>
+    """
+
     # Set up the mock for asynchronous calls
     with patch(
         "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
@@ -267,9 +274,8 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
         model = "vertex_ai/test"
 
         response = await litellm.aspeech(
-            input=None,
+            input=ssml,
             model=model,
-            ssml="async hello what llm guardrail do you have",
             voice={
                 "languageCode": "en-UK",
                 "name": "en-UK-Studio-O",
@@ -291,7 +297,7 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
         assert kwargs["headers"]["Authorization"] is not None
 
         assert kwargs["json"] == {
-            "input": {"ssml": "async hello what llm guardrail do you have"},
+            "input": {"ssml": ssml},
             "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"},
             "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"},
         }

From 1e12a50cb38749895f88ea585d5008fc75dc76c7 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Wed, 28 Aug 2024 13:08:49 -0700
Subject: [PATCH 7/7] doc add ssml usage

---
 docs/my-website/docs/providers/vertex.md | 87 ++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 7 deletions(-)

diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index 1e4f106a7..582636630 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1843,13 +1843,15 @@ print("response from proxy", response)
 
 ### Usage - `ssml` as input
 
+Pass your SSML to the `input` param. If it contains `<speak>`, it is automatically detected and passed as `ssml` to the Vertex AI API.
+
+If you need to force your `input` to be passed as `ssml`, set `use_ssml=True`
 
 <Tabs>
 <TabItem value="sdk" label="SDK">
 
 Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
 
-**Sync Usage**
 
 ```python
 speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
@@ -1863,9 +1865,8 @@ ssml = """
 """
 
 response = litellm.speech(
-    input=None,
+    input=ssml,
     model="vertex_ai/test",
-    ssml=ssml,
     voice={
         "languageCode": "en-UK",
         "name": "en-UK-Studio-O",
@@ -1898,11 +1899,8 @@ ssml = """
 # https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
 response = client.audio.speech.create(
     model = "vertex-tts",
-    input=None, # pass as None since OpenAI SDK requires this param
+    input=ssml,
     voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
-    extra_body={
-        "ssml": ssml
-    }
 )
 print("response from proxy", response)
 ```
@@ -1911,6 +1909,81 @@ print("response from proxy", response)
 </Tabs>
 
 
+### Forcing SSML Usage
+
+You can force the use of SSML by setting the `use_ssml` parameter to `True`. This is useful when you want to ensure that your input is treated as SSML, even if it doesn't contain the `<speak>` tags.
+
+Here are examples of how to force SSML usage:
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
+
+
+```python
+speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
+
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the <break strength="medium" /> text-to-speech API.</p>
+</speak>
+"""
+
+response = litellm.speech(
+    input=ssml,
+    use_ssml=True,
+    model="vertex_ai/test",
+    voice={
+        "languageCode": "en-UK",
+        "name": "en-UK-Studio-O",
+    },
+    audioConfig={
+        "audioEncoding": "LINEAR22",
+        "speakingRate": "10",
+    },
+)
+response.stream_to_file(speech_file_path)
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the <break strength="medium" /> text-to-speech API.</p>
+</speak>
+"""
+
+# see supported values for "voice" on vertex here: 
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input=ssml, # pass the SSML as `input`; `use_ssml` below forces it to be sent as ssml
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
+    extra_body={"use_ssml": True},
+)
+print("response from proxy", response)
+```
+
+</TabItem>
+</Tabs>
+
+
+
+
+
+
 
 ## Extra