From 649c3bb0ddcf014bc397c521c97f7058bcee3740 Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 22:49:25 +0000
Subject: [PATCH 1/4] (feat) - Add support for JSON mode in Vertex AI

---
 litellm/llms/vertex_ai.py | 13 +++++++++++++
 litellm/utils.py          | 12 ++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index ddfc8e9fb..9a5e92828 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -25,6 +25,7 @@ class VertexAIError(Exception):
 class VertexAIConfig:
     """
     Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
+    Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference

     The class `VertexAIConfig` provides configuration for the VertexAI's API interface. Below are the parameters:

@@ -36,6 +37,12 @@ class VertexAIConfig:

     - `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.

+    - `response_mime_type` (str): The MIME type of the response. The default value is 'text/plain'.
+
+    - `candidate_count` (int): Number of generated responses to return.
+
+    - `stop_sequences` (List[str]): The set of character sequences (up to 5) that will stop output generation. If specified, the API will stop at the first appearance of a stop sequence. The stop sequence will not be included as part of the response.
+
     Note: Please make sure to modify the default parameters as required for your use case.
     """

@@ -43,6 +50,9 @@ class VertexAIConfig:
     max_output_tokens: Optional[int] = None
     top_p: Optional[float] = None
     top_k: Optional[int] = None
+    response_mime_type: Optional[str] = None
+    candidate_count: Optional[int] = None
+    stop_sequences: Optional[list] = None

     def __init__(
         self,
@@ -50,6 +60,9 @@ class VertexAIConfig:
         max_output_tokens: Optional[int] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
+        response_mime_type: Optional[str] = None,
+        candidate_count: Optional[int] = None,
+        stop_sequences: Optional[list] = None,
     ) -> None:
         locals_ = locals()
         for key, value in locals_.items():
diff --git a/litellm/utils.py b/litellm/utils.py
index 7540367d9..7bef853fe 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -4840,8 +4840,17 @@ def get_optional_params(
             optional_params["top_p"] = top_p
         if stream:
             optional_params["stream"] = stream
+        if n is not None:
+            optional_params["candidate_count"] = n
+        if stop is not None:
+            if isinstance(stop, str):
+                optional_params["stop_sequences"] = [stop]
+            elif isinstance(stop, list):
+                optional_params["stop_sequences"] = stop
         if max_tokens is not None:
             optional_params["max_output_tokens"] = max_tokens
+        if response_format is not None and response_format["type"] == "json_object":
+            optional_params["response_mime_type"] = "application/json"
         if tools is not None and isinstance(tools, list):
             from vertexai.preview import generative_models

@@ -5528,6 +5537,9 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
             "stream",
             "tools",
             "tool_choice",
+            "response_format",
+            "n",
+            "stop",
         ]
     elif custom_llm_provider == "sagemaker":
         return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]

From d08674bf2f0152c107b9bb60ce18bcf5dcfc6d5f Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:33:59 +0000
Subject: [PATCH 2/4] (feat) - Dirty hack to get response_mime_type working before
 it's released in the Python SDK.

---
 litellm/llms/vertex_ai.py | 82 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index 9a5e92828..3bd4579e4 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -3,7 +3,7 @@ import json
 from enum import Enum
 import requests
 import time
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Union, List
 from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
 import litellm, uuid
 import httpx
@@ -308,6 +308,30 @@ def completion(
         from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
         import google.auth  # type: ignore

+        class ExtendedGenerationConfig(GenerationConfig):
+            """Extended parameters for the generation."""
+
+            def __init__(
+                self,
+                *,
+                temperature: Optional[float] = None,
+                top_p: Optional[float] = None,
+                top_k: Optional[int] = None,
+                candidate_count: Optional[int] = None,
+                max_output_tokens: Optional[int] = None,
+                stop_sequences: Optional[List[str]] = None,
+                response_mime_type: Optional[str] = None,
+            ):
+                super().__init__(
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    candidate_count=candidate_count,
+                    max_output_tokens=max_output_tokens,
+                    stop_sequences=stop_sequences,
+                )
+                self.response_mime_type = response_mime_type
+
         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         print_verbose(
             f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
         )
@@ -449,7 +473,7 @@ def completion(

                 model_response = llm_model.generate_content(
                     contents=content,
-                    generation_config=GenerationConfig(**optional_params),
+                    generation_config=ExtendedGenerationConfig(**optional_params),
                     safety_settings=safety_settings,
                     stream=True,
                     tools=tools,
@@ -471,7 +495,7 @@ def completion(
             ## LLM Call
             response = llm_model.generate_content(
                 contents=content,
-                generation_config=GenerationConfig(**optional_params),
+                generation_config=ExtendedGenerationConfig(**optional_params),
                 safety_settings=safety_settings,
                 tools=tools,
             )
@@ -712,6 +736,30 @@ async def async_completion(
     try:
         from vertexai.preview.generative_models import GenerationConfig

+        class ExtendedGenerationConfig(GenerationConfig):
+            """Extended parameters for the generation."""
+
+            def __init__(
+                self,
+                *,
+                temperature: Optional[float] = None,
+                top_p: Optional[float] = None,
+                top_k: Optional[int] = None,
+                candidate_count: Optional[int] = None,
+                max_output_tokens: Optional[int] = None,
+                stop_sequences: Optional[List[str]] = None,
+                response_mime_type: Optional[str] = None,
+            ):
+                super().__init__(
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    candidate_count=candidate_count,
+                    max_output_tokens=max_output_tokens,
+                    stop_sequences=stop_sequences,
+                )
+                self.response_mime_type = response_mime_type
+
         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
             print_verbose(f"\nProcessing input messages = {messages}")
@@ -734,7 +782,7 @@ async def async_completion(
             ## LLM Call
             response = await llm_model._generate_content_async(
                 contents=content,
-                generation_config=GenerationConfig(**optional_params),
+                generation_config=ExtendedGenerationConfig(**optional_params),
                 tools=tools,
             )

@@ -920,6 +968,30 @@ async def async_streaming(
     Add support for async streaming calls for gemini-pro
     """
     from vertexai.preview.generative_models import GenerationConfig

+    class ExtendedGenerationConfig(GenerationConfig):
+        """Extended parameters for the generation."""
+
+        def __init__(
+            self,
+            *,
+            temperature: Optional[float] = None,
+            top_p: Optional[float] = None,
+            top_k: Optional[int] = None,
+            candidate_count: Optional[int] = None,
+            max_output_tokens: Optional[int] = None,
+            stop_sequences: Optional[List[str]] = None,
+            response_mime_type: Optional[str] = None,
+        ):
+            super().__init__(
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                candidate_count=candidate_count,
+                max_output_tokens=max_output_tokens,
+                stop_sequences=stop_sequences,
+            )
+            self.response_mime_type = response_mime_type
+
     if mode == "vision":
         stream = optional_params.pop("stream")
         tools = optional_params.pop("tools", None)
@@ -940,7 +1012,7 @@ async def async_streaming(

         response = await llm_model._generate_content_streaming_async(
             contents=content,
-            generation_config=GenerationConfig(**optional_params),
+            generation_config=ExtendedGenerationConfig(**optional_params),
             tools=tools,
         )
         optional_params["stream"] = True

From 05350037bee54d35c314e863724f1421643a1458 Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:45:41 +0000
Subject: [PATCH 3/4] (feat) - Extreme dirty hack for response_mime_type in Vertex AI.

---
 litellm/llms/vertex_ai.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index 3bd4579e4..0fff09aa1 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -322,15 +322,15 @@ def completion(
                 stop_sequences: Optional[List[str]] = None,
                 response_mime_type: Optional[str] = None,
             ):
-                super().__init__(
+                self._raw_generation_config = gapic_content_types.GenerationConfig(
                     temperature=temperature,
                     top_p=top_p,
                     top_k=top_k,
                     candidate_count=candidate_count,
                     max_output_tokens=max_output_tokens,
                     stop_sequences=stop_sequences,
+                    response_mime_type=response_mime_type,
                 )
-                self.response_mime_type = response_mime_type

         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         print_verbose(
             f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
@@ -735,6 +735,7 @@ async def async_completion(
     """
     try:
         from vertexai.preview.generative_models import GenerationConfig
+        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore

         class ExtendedGenerationConfig(GenerationConfig):
             """Extended parameters for the generation."""
@@ -750,15 +751,15 @@ async def async_completion(
                 stop_sequences: Optional[List[str]] = None,
                 response_mime_type: Optional[str] = None,
             ):
-                super().__init__(
+                self._raw_generation_config = gapic_content_types.GenerationConfig(
                     temperature=temperature,
                     top_p=top_p,
                     top_k=top_k,
                     candidate_count=candidate_count,
                     max_output_tokens=max_output_tokens,
                     stop_sequences=stop_sequences,
+                    response_mime_type=response_mime_type,
                 )
-                self.response_mime_type = response_mime_type

         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@@ -967,6 +968,7 @@ async def async_streaming(
     Add support for async streaming calls for gemini-pro
     """
     from vertexai.preview.generative_models import GenerationConfig
+    from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore

     class ExtendedGenerationConfig(GenerationConfig):
         """Extended parameters for the generation."""
@@ -982,15 +984,15 @@ async def async_streaming(
             stop_sequences: Optional[List[str]] = None,
             response_mime_type: Optional[str] = None,
         ):
-            super().__init__(
+            self._raw_generation_config = gapic_content_types.GenerationConfig(
                 temperature=temperature,
                 top_p=top_p,
                 top_k=top_k,
                 candidate_count=candidate_count,
                 max_output_tokens=max_output_tokens,
                 stop_sequences=stop_sequences,
+                response_mime_type=response_mime_type,
             )
-            self.response_mime_type = response_mime_type

     if mode == "vision":
         stream = optional_params.pop("stream")

From 9c55be3e8246a22069cd869b3febe0b96e9bf48c Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:54:55 +0000
Subject: [PATCH 4/4] (feat) - Bump version for Vertex AI SDK.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f09dd7501..2acb41165 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@ pandas==2.1.1 # for viewing clickhouse spend analytics
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
 pynacl==1.5.0 # for encrypting keys
-google-cloud-aiplatform==1.43.0 # for vertex ai calls
+google-cloud-aiplatform==1.47.0 # for vertex ai calls
 anthropic[vertex]==0.21.3
 google-generativeai==0.3.2 # for vertex ai calls
 async_generator==1.10.0 # for async ollama calls
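
Illustrative usage (a minimal sketch, not part of the patches above): assuming Vertex AI credentials, project, and location are already configured for litellm, and using "vertex_ai/gemini-pro" purely as an example model name, a caller could exercise the JSON mode added in PATCH 1/4 as below; response_format, n, and stop are mapped to response_mime_type, candidate_count, and stop_sequences by the get_optional_params changes.

# Hypothetical example; the model name, prompt, and stop sequence are illustrative only.
import litellm

response = litellm.completion(
    model="vertex_ai/gemini-pro",  # assumes Vertex AI auth/project/location are set up
    messages=[{"role": "user", "content": "Return three colors as a JSON array."}],
    response_format={"type": "json_object"},  # mapped to response_mime_type="application/json"
    n=1,                                      # mapped to candidate_count
    stop=["###"],                             # mapped to stop_sequences
)
print(response.choices[0].message.content)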