From 6d11b392f8786ccb287c543f5b02e655db93085a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 12:17:53 -0700 Subject: [PATCH 1/7] add ssml input on vertex tts --- litellm/llms/text_to_speech/vertex_ai.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py index b9fca5325..0b6b9deb8 100644 --- a/litellm/llms/text_to_speech/vertex_ai.py +++ b/litellm/llms/text_to_speech/vertex_ai.py @@ -19,7 +19,8 @@ from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gem class VertexInput(TypedDict, total=False): - text: str + text: Optional[str] + ssml: Optional[str] class VertexVoice(TypedDict, total=False): @@ -86,10 +87,11 @@ class VertexTextToSpeechAPI(VertexLLM): ####### Build the request ################ # API Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize - vertex_input = VertexInput(text=input) + kwargs = kwargs or {} + vertex_input = VertexInput(text=input, ssml=kwargs.get("ssml", None)) + validate_vertex_input(vertex_input) # required param optional_params = optional_params or {} - kwargs = kwargs or {} if voice is not None: vertex_voice = VertexVoice(**voice) elif "voice" in kwargs: @@ -203,3 +205,12 @@ class VertexTextToSpeechAPI(VertexLLM): # Initialize the HttpxBinaryResponseContent instance http_binary_response = HttpxBinaryResponseContent(response) return http_binary_response + + +def validate_vertex_input(input_data: VertexInput) -> None: + if input_data.get("text", None) is None: + input_data.pop("text") + if "text" not in input_data and "ssml" not in input_data: + raise ValueError("Either 'text' or 'ssml' must be provided.") + if "text" in input_data and "ssml" in input_data: + raise ValueError("Only one of 'text' or 'ssml' should be provided, not both.") From 52f098ef434f9ccd6ae631277aa8741febbaec7b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 
12:18:34 -0700 Subject: [PATCH 2/7] add vertex ssml test --- litellm/tests/test_audio_speech.py | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py index 5de996fa1..d9ed3fd6e 100644 --- a/litellm/tests/test_audio_speech.py +++ b/litellm/tests/test_audio_speech.py @@ -243,3 +243,55 @@ async def test_speech_litellm_vertex_async_with_voice(): "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"}, "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"}, } + + +@pytest.mark.asyncio +async def test_speech_litellm_vertex_async_with_voice_ssml(): + # Mock the response + mock_response = AsyncMock() + + def return_val(): + return { + "audioContent": "dGVzdCByZXNwb25zZQ==", + } + + mock_response.json = return_val + mock_response.status_code = 200 + + # Set up the mock for asynchronous calls + with patch( + "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", + new_callable=AsyncMock, + ) as mock_async_post: + mock_async_post.return_value = mock_response + model = "vertex_ai/test" + + response = await litellm.aspeech( + input=None, + model=model, + ssml="async hello what llm guardrail do you have", + voice={ + "languageCode": "en-UK", + "name": "en-UK-Studio-O", + }, + audioConfig={ + "audioEncoding": "LINEAR22", + "speakingRate": "10", + }, + ) + + # Assert asynchronous call + mock_async_post.assert_called_once() + _, kwargs = mock_async_post.call_args + print("call args", kwargs) + + assert kwargs["url"] == "https://texttospeech.googleapis.com/v1/text:synthesize" + + assert "x-goog-user-project" in kwargs["headers"] + assert kwargs["headers"]["Authorization"] is not None + + assert kwargs["json"] == { + "input": {"ssml": "async hello what llm guardrail do you have"}, + "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"}, + "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"}, + } From 
043919b27856c63b59c92c14f4e34fdc0d38c5f9 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 12:27:44 -0700 Subject: [PATCH 3/7] use ssml with litellm vertex --- docs/my-website/docs/providers/vertex.md | 57 +++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 697aa0701..8912133b8 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -1768,7 +1768,7 @@ LiteLLM supports calling [Vertex AI Text to Speech API](https://console.cloud.go -Usage +### Usage - Basic @@ -1841,6 +1841,61 @@ print("response from proxy", response) +### Usage - `ssml` as input + + + + + +Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param + +**Sync Usage** + +```python +speech_file_path = Path(__file__).parent / "speech_vertex.mp3" +response = litellm.speech( + input=None, + model="vertex_ai/test", + ssml="async hello what llm guardrail do you have", + voice={ + "languageCode": "en-UK", + "name": "en-UK-Studio-O", + }, + audioConfig={ + "audioEncoding": "LINEAR22", + "speakingRate": "10", + }, +) +response.stream_to_file(speech_file_path) +``` + + + + + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# see supported values for "voice" on vertex here: +# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech +response = client.audio.speech.create( + model = "vertex-tts", + input=None, # pass as None since OpenAI SDK requires this param + voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, + extra_body={ + "ssml": "async hello what llm guardrail do you have" + } +) +print("response from proxy", response) +``` + + + + + + ## Extra ### Using `GOOGLE_APPLICATION_CREDENTIALS` From 3a4c7f20784e81f11e87a6ceff34ef2c3b0b38c8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 
Aug 2024 12:32:51 -0700 Subject: [PATCH 4/7] add ssml support on docs --- docs/my-website/docs/providers/vertex.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 8912133b8..1e4f106a7 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -1853,10 +1853,19 @@ Vertex AI does not support passing a `model` param - so passing `model=vertex_ai ```python speech_file_path = Path(__file__).parent / "speech_vertex.mp3" + + +ssml = """ + +

Hello, world!

+

This is a test of the text-to-speech API.

+
+""" + response = litellm.speech( input=None, model="vertex_ai/test", - ssml="async hello what llm guardrail do you have", + ssml=ssml, voice={ "languageCode": "en-UK", "name": "en-UK-Studio-O", @@ -1878,6 +1887,13 @@ import openai client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") +ssml = """ + +

Hello, world!

+

This is a test of the text-to-speech API.

+
+""" + # see supported values for "voice" on vertex here: # https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech response = client.audio.speech.create( @@ -1885,7 +1901,7 @@ response = client.audio.speech.create( input=None, # pass as None since OpenAI SDK requires this param voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, extra_body={ - "ssml": "async hello what llm guardrail do you have" + "ssml": ssml } ) print("response from proxy", response) From 58506dbade63d1b9a940c04f1fc33022bce4d7ec Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 12:52:26 -0700 Subject: [PATCH 5/7] update validate_vertex_input --- litellm/llms/text_to_speech/vertex_ai.py | 38 +++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py index 0b6b9deb8..39c5ddfd4 100644 --- a/litellm/llms/text_to_speech/vertex_ai.py +++ b/litellm/llms/text_to_speech/vertex_ai.py @@ -88,10 +88,12 @@ class VertexTextToSpeechAPI(VertexLLM): ####### Build the request ################ # API Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize kwargs = kwargs or {} - vertex_input = VertexInput(text=input, ssml=kwargs.get("ssml", None)) - validate_vertex_input(vertex_input) - # required param optional_params = optional_params or {} + + vertex_input = VertexInput(text=input) + validate_vertex_input(vertex_input, kwargs, optional_params) + + # required param if voice is not None: vertex_voice = VertexVoice(**voice) elif "voice" in kwargs: @@ -207,10 +209,32 @@ class VertexTextToSpeechAPI(VertexLLM): return http_binary_response -def validate_vertex_input(input_data: VertexInput) -> None: - if input_data.get("text", None) is None: - input_data.pop("text") - if "text" not in input_data and "ssml" not in input_data: +def validate_vertex_input( + input_data: VertexInput, kwargs: dict, optional_params: dict +) -> None: + # Remove 
None values + if input_data.get("text") is None: + input_data.pop("text", None) + if input_data.get("ssml") is None: + input_data.pop("ssml", None) + + # Check if use_ssml is set + use_ssml = kwargs.get("use_ssml", optional_params.get("use_ssml", False)) + + if use_ssml: + if "text" in input_data: + input_data["ssml"] = input_data.pop("text") + elif "ssml" not in input_data: + raise ValueError("SSML input is required when use_ssml is True.") + else: + # LiteLLM will auto-detect if text is in ssml format + # check if "text" is an ssml - in this case we should pass it as ssml instead of text + if input_data: + _text = input_data.get("text", None) or "" + if "<speak>" in _text: + input_data["ssml"] = input_data.pop("text") + + if not input_data: raise ValueError("Either 'text' or 'ssml' must be provided.") if "text" in input_data and "ssml" in input_data: raise ValueError("Only one of 'text' or 'ssml' should be provided, not both.") From 5ea27bdea99e0ada8f15f9956e7b46701a30d162 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 12:54:23 -0700 Subject: [PATCH 6/7] simpify ssml usage --- litellm/tests/test_audio_speech.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py index d9ed3fd6e..2c710d37e 100644 --- a/litellm/tests/test_audio_speech.py +++ b/litellm/tests/test_audio_speech.py @@ -258,6 +258,13 @@ async def test_speech_litellm_vertex_async_with_voice_ssml(): mock_response.json = return_val mock_response.status_code = 200 + ssml = """ + +

Hello, world!

+

This is a test of the text-to-speech API.

+
+ """ + # Set up the mock for asynchronous calls with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", @@ -267,9 +274,8 @@ async def test_speech_litellm_vertex_async_with_voice_ssml(): model = "vertex_ai/test" response = await litellm.aspeech( - input=None, + input=ssml, model=model, - ssml="async hello what llm guardrail do you have", voice={ "languageCode": "en-UK", "name": "en-UK-Studio-O", @@ -291,7 +297,7 @@ async def test_speech_litellm_vertex_async_with_voice_ssml(): assert kwargs["headers"]["Authorization"] is not None assert kwargs["json"] == { - "input": {"ssml": "async hello what llm guardrail do you have"}, + "input": {"ssml": ssml}, "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"}, "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"}, } From 1e12a50cb38749895f88ea585d5008fc75dc76c7 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 28 Aug 2024 13:08:49 -0700 Subject: [PATCH 7/7] doc add ssml usage --- docs/my-website/docs/providers/vertex.md | 87 ++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 1e4f106a7..582636630 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -1843,13 +1843,15 @@ print("response from proxy", response) ### Usage - `ssml` as input +Pass your `ssml` as input to the `input` param, if it contains `<speak>`, it will be automatically detected and passed as `ssml` to the Vertex AI API + +If you need to force your `input` to be passed as `ssml`, set `use_ssml=True` Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param -**Sync Usage** ```python speech_file_path = Path(__file__).parent / "speech_vertex.mp3" @@ -1863,9 +1865,8 @@ ssml = """ """ response = litellm.speech( - input=None, + input=ssml, model="vertex_ai/test", - ssml=ssml, voice={ "languageCode": 
"en-UK", "name": "en-UK-Studio-O", @@ -1898,11 +1899,8 @@ ssml = """ # https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech response = client.audio.speech.create( model = "vertex-tts", - input=None, # pass as None since OpenAI SDK requires this param + input=ssml, voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, - extra_body={ - "ssml": ssml - } ) print("response from proxy", response) ``` @@ -1911,6 +1909,81 @@ print("response from proxy", response) +### Forcing SSML Usage + +You can force the use of SSML by setting the `use_ssml` parameter to `True`. This is useful when you want to ensure that your input is treated as SSML, even if it doesn't contain the `<speak>` tags. + +Here are examples of how to force SSML usage: + + + + + +Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param + + +```python +speech_file_path = Path(__file__).parent / "speech_vertex.mp3" + + +ssml = """ + +

Hello, world!

+

This is a test of the text-to-speech API.

+
+""" + +response = litellm.speech( + input=ssml, + use_ssml=True, + model="vertex_ai/test", + voice={ + "languageCode": "en-UK", + "name": "en-UK-Studio-O", + }, + audioConfig={ + "audioEncoding": "LINEAR22", + "speakingRate": "10", + }, +) +response.stream_to_file(speech_file_path) +``` + +
+ + + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +ssml = """ +<speak> + <p>Hello, world!</p> + <p>This is a test of the text-to-speech API.</p> +</speak> +""" + +# see supported values for "voice" on vertex here: +# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech +response = client.audio.speech.create( + model = "vertex-tts", + input=ssml, # SSML string; `use_ssml` below forces it to be sent as SSML + voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, + extra_body={"use_ssml": True}, +) +print("response from proxy", response) +``` +
+
+ + + + + + ## Extra