diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index 697aa0701..582636630 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1768,7 +1768,7 @@ LiteLLM supports calling [Vertex AI Text to Speech API](https://console.cloud.go
 
-Usage
+### Usage - Basic
 
@@ -1841,6 +1841,150 @@ print("response from proxy", response)
 
+### Usage - `ssml` as input
+
+Pass your `ssml` as input to the `input` param. If it contains `<speak>`, it will be automatically detected and passed as `ssml` to the Vertex AI API.
+
+If you need to force your `input` to be passed as `ssml`, set `use_ssml=True`.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
+
+```python
+speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
+
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the text-to-speech API.</p>
+</speak>
+"""
+
+response = litellm.speech(
+    input=ssml,
+    model="vertex_ai/test",
+    voice={
+        "languageCode": "en-UK",
+        "name": "en-UK-Studio-O",
+    },
+    audioConfig={
+        "audioEncoding": "LINEAR22",
+        "speakingRate": "10",
+    },
+)
+response.stream_to_file(speech_file_path)
+```
+
+</TabItem>
+<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the text-to-speech API.</p>
+</speak>
+"""
+
+# see supported values for "voice" on vertex here:
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input=ssml,
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
+)
+print("response from proxy", response)
+```
+
+</TabItem>
+</Tabs>
+
+### Forcing SSML Usage
+
+You can force the use of SSML by setting the `use_ssml` parameter to `True`. This is useful when you want to ensure that your input is treated as SSML, even if it doesn't contain the `<speak>` tags.
+
+Here are examples of how to force SSML usage:
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param
+
+```python
+speech_file_path = Path(__file__).parent / "speech_vertex.mp3"
+
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the text-to-speech API.</p>
+</speak>
+"""
+
+response = litellm.speech(
+    input=ssml,
+    use_ssml=True,
+    model="vertex_ai/test",
+    voice={
+        "languageCode": "en-UK",
+        "name": "en-UK-Studio-O",
+    },
+    audioConfig={
+        "audioEncoding": "LINEAR22",
+        "speakingRate": "10",
+    },
+)
+response.stream_to_file(speech_file_path)
+```
+
+</TabItem>
+<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+ssml = """
+<speak>
+    <p>Hello, world!</p>
+    <p>This is a test of the text-to-speech API.</p>
+</speak>
+"""
+
+# see supported values for "voice" on vertex here:
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input=ssml,
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'},
+    extra_body={"use_ssml": True},
+)
+print("response from proxy", response)
+```
+
+</TabItem>
+</Tabs>
+
+
 
 ## Extra
 
 ### Using `GOOGLE_APPLICATION_CREDENTIALS`
diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py
index b9fca5325..39c5ddfd4 100644
--- a/litellm/llms/text_to_speech/vertex_ai.py
+++ b/litellm/llms/text_to_speech/vertex_ai.py
@@ -19,7 +19,8 @@ from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gem
 
 
 class VertexInput(TypedDict, total=False):
-    text: str
+    text: Optional[str]
+    ssml: Optional[str]
 
 
 class VertexVoice(TypedDict, total=False):
@@ -86,10 +87,13 @@ class VertexTextToSpeechAPI(VertexLLM):
         ####### Build the request ################
         # API Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize
-        vertex_input = VertexInput(text=input)
-        # required param
-        optional_params = optional_params or {}
         kwargs = kwargs or {}
+        optional_params = optional_params or {}
+
+        vertex_input = VertexInput(text=input)
+        validate_vertex_input(vertex_input, kwargs, optional_params)
+
+        # required param
         if voice is not None:
             vertex_voice = VertexVoice(**voice)
         elif "voice" in kwargs:
@@ -203,3 +207,34 @@ class VertexTextToSpeechAPI(VertexLLM):
         # Initialize the HttpxBinaryResponseContent instance
         http_binary_response = HttpxBinaryResponseContent(response)
         return http_binary_response
+
+
+def validate_vertex_input(
+    input_data: VertexInput, kwargs: dict, optional_params: dict
+) -> None:
+    # Remove None values
+    if input_data.get("text") is None:
+        input_data.pop("text", None)
+    if input_data.get("ssml") is None:
+        input_data.pop("ssml", None)
+
+    # Check if use_ssml is set
+    use_ssml = kwargs.get("use_ssml", optional_params.get("use_ssml", False))
+
+    if use_ssml:
+        if "text" in input_data:
+            input_data["ssml"] = input_data.pop("text")
+        elif "ssml" not in input_data:
+            raise ValueError("SSML input is required when use_ssml is True.")
+    else:
+        # LiteLLM will auto-detect if text is in ssml format
+        # check if "text" is an ssml - in this case we should pass it as ssml instead of text
+        if input_data:
+            _text = input_data.get("text", None) or ""
+            if "<speak>" in _text:
+                input_data["ssml"] = input_data.pop("text")
+
+    if not input_data:
+        raise ValueError("Either 'text' or 'ssml' must be provided.")
+    if "text" in input_data and "ssml" in input_data:
+        raise ValueError("Only one of 'text' or 'ssml' should be provided, not both.")
diff --git a/litellm/tests/test_audio_speech.py b/litellm/tests/test_audio_speech.py
index 5de996fa1..2c710d37e 100644
--- a/litellm/tests/test_audio_speech.py
+++ b/litellm/tests/test_audio_speech.py
@@ -243,3 +243,61 @@ async def test_speech_litellm_vertex_async_with_voice():
         "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"},
         "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"},
     }
+
+
+@pytest.mark.asyncio
+async def test_speech_litellm_vertex_async_with_voice_ssml():
+    # Mock the response
+    mock_response = AsyncMock()
+
+    def return_val():
+        return {
+            "audioContent": "dGVzdCByZXNwb25zZQ==",
+        }
+
+    mock_response.json = return_val
+    mock_response.status_code = 200
+
+    ssml = """
+    <speak>
+        <p>Hello, world!</p>
+        <p>This is a test of the text-to-speech API.</p>
+    </speak>
+    """
+
+    # Set up the mock for asynchronous calls
+    with patch(
+        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
+        new_callable=AsyncMock,
+    ) as mock_async_post:
+        mock_async_post.return_value = mock_response
+        model = "vertex_ai/test"
+
+        response = await litellm.aspeech(
+            input=ssml,
+            model=model,
+            voice={
+                "languageCode": "en-UK",
+                "name": "en-UK-Studio-O",
+            },
+            audioConfig={
+                "audioEncoding": "LINEAR22",
+                "speakingRate": "10",
+            },
+        )
+
+        # Assert asynchronous call
+        mock_async_post.assert_called_once()
+        _, kwargs = mock_async_post.call_args
+        print("call args", kwargs)
+
+        assert kwargs["url"] == "https://texttospeech.googleapis.com/v1/text:synthesize"
+
+        assert "x-goog-user-project" in kwargs["headers"]
+        assert kwargs["headers"]["Authorization"] is not None
+
+        assert kwargs["json"] == {
+            "input": {"ssml": ssml},
+            "voice": {"languageCode": "en-UK", "name": "en-UK-Studio-O"},
+            "audioConfig": {"audioEncoding": "LINEAR22", "speakingRate": "10"},
+        }
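
For quick reference, below is a minimal, self-contained sketch of the input-selection rule that the new `validate_vertex_input` helper enforces: the request body uses `ssml` when `use_ssml=True` is passed, or when the input contains `<speak>`; otherwise it uses `text`. The `build_vertex_input` function here is illustrative only and is not part of this PR or of LiteLLM's public API.

```python
# Illustrative sketch (not part of this PR): mirrors how the Vertex AI
# text:synthesize "input" object is chosen between {"text": ...} and {"ssml": ...}.
from typing import Dict


def build_vertex_input(text: str, use_ssml: bool = False) -> Dict[str, str]:
    """Pick the request "input" object the same way validate_vertex_input does."""
    if use_ssml or "<speak>" in text:
        # forced via use_ssml=True, or auto-detected from the <speak> tag
        return {"ssml": text}
    return {"text": text}


assert build_vertex_input("the quick brown fox") == {"text": "the quick brown fox"}
assert build_vertex_input("<speak>hi</speak>") == {"ssml": "<speak>hi</speak>"}
assert build_vertex_input("hello", use_ssml=True) == {"ssml": "hello"}
```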