From 649c3bb0ddcf014bc397c521c97f7058bcee3740 Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 22:49:25 +0000
Subject: [PATCH 1/4] (feat) - Add support for JSON mode in Vertex AI

---
 litellm/llms/vertex_ai.py | 13 +++++++++++++
 litellm/utils.py          | 12 ++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index ddfc8e9fb..9a5e92828 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -25,6 +25,7 @@ class VertexAIError(Exception):
 class VertexAIConfig:
     """
     Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
+    Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference

     The class `VertexAIConfig` provides configuration for the VertexAI's API interface. Below are the parameters:

@@ -36,6 +37,12 @@ class VertexAIConfig:

     - `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.

+    - `response_mime_type` (str): The MIME type of the response. The default value is 'text/plain'.
+
+    - `candidate_count` (int): Number of generated responses to return.
+
+    - `stop_sequences` (List[str]): The set of character sequences (up to 5) that will stop output generation. If specified, the API will stop at the first appearance of a stop sequence. The stop sequence will not be included as part of the response.
+
     Note: Please make sure to modify the default parameters as required for your use case.
     """

@@ -43,6 +50,9 @@ class VertexAIConfig:
     max_output_tokens: Optional[int] = None
     top_p: Optional[float] = None
     top_k: Optional[int] = None
+    response_mime_type: Optional[str] = None
+    candidate_count: Optional[int] = None
+    stop_sequences: Optional[list] = None

     def __init__(
         self,
@@ -50,6 +60,9 @@ class VertexAIConfig:
         max_output_tokens: Optional[int] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
+        response_mime_type: Optional[str] = None,
+        candidate_count: Optional[int] = None,
+        stop_sequences: Optional[list] = None,
     ) -> None:
         locals_ = locals()
         for key, value in locals_.items():
diff --git a/litellm/utils.py b/litellm/utils.py
index 7540367d9..7bef853fe 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -4840,8 +4840,17 @@ def get_optional_params(
             optional_params["top_p"] = top_p
         if stream:
             optional_params["stream"] = stream
+        if n is not None:
+            optional_params["candidate_count"] = n
+        if stop is not None:
+            if isinstance(stop, str):
+                optional_params["stop_sequences"] = [stop]
+            elif isinstance(stop, list):
+                optional_params["stop_sequences"] = stop
         if max_tokens is not None:
             optional_params["max_output_tokens"] = max_tokens
+        if response_format is not None and response_format["type"] == "json_object":
+            optional_params["response_mime_type"] = "application/json"
         if tools is not None and isinstance(tools, list):
             from vertexai.preview import generative_models

@@ -5528,6 +5537,9 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
             "stream",
             "tools",
             "tool_choice",
+            "response_format",
+            "n",
+            "stop",
         ]
     elif custom_llm_provider == "sagemaker":
         return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]

From d08674bf2f0152c107b9bb60ce18bcf5dcfc6d5f Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:33:59 +0000
Subject: [PATCH 2/4] (feat) - Dirty hack to get response_mime_type working before
 it's released in the Python SDK.

---
 litellm/llms/vertex_ai.py | 82 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index 9a5e92828..3bd4579e4 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -3,7 +3,7 @@ import json
 from enum import Enum
 import requests
 import time
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Union, List
 from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
 import litellm, uuid
 import httpx
@@ -308,6 +308,30 @@ def completion(
         from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
         import google.auth  # type: ignore

+        class ExtendedGenerationConfig(GenerationConfig):
+            """Extended parameters for the generation."""
+
+            def __init__(
+                self,
+                *,
+                temperature: Optional[float] = None,
+                top_p: Optional[float] = None,
+                top_k: Optional[int] = None,
+                candidate_count: Optional[int] = None,
+                max_output_tokens: Optional[int] = None,
+                stop_sequences: Optional[List[str]] = None,
+                response_mime_type: Optional[str] = None,
+            ):
+                super().__init__(
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    candidate_count=candidate_count,
+                    max_output_tokens=max_output_tokens,
+                    stop_sequences=stop_sequences,
+                )
+                self.response_mime_type = response_mime_type
+
         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         print_verbose(
             f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
         )
@@ -449,7 +473,7 @@ def completion(

                 model_response = llm_model.generate_content(
                     contents=content,
-                    generation_config=GenerationConfig(**optional_params),
+                    generation_config=ExtendedGenerationConfig(**optional_params),
                     safety_settings=safety_settings,
                     stream=True,
                     tools=tools,
@@ -471,7 +495,7 @@ def completion(
             ## LLM Call
             response = llm_model.generate_content(
                 contents=content,
-                generation_config=GenerationConfig(**optional_params),
+                generation_config=ExtendedGenerationConfig(**optional_params),
                 safety_settings=safety_settings,
                 tools=tools,
             )
@@ -712,6 +736,30 @@ async def async_completion(
     try:
         from vertexai.preview.generative_models import GenerationConfig

+        class ExtendedGenerationConfig(GenerationConfig):
+            """Extended parameters for the generation."""
+
+            def __init__(
+                self,
+                *,
+                temperature: Optional[float] = None,
+                top_p: Optional[float] = None,
+                top_k: Optional[int] = None,
+                candidate_count: Optional[int] = None,
+                max_output_tokens: Optional[int] = None,
+                stop_sequences: Optional[List[str]] = None,
+                response_mime_type: Optional[str] = None,
+            ):
+                super().__init__(
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    candidate_count=candidate_count,
+                    max_output_tokens=max_output_tokens,
+                    stop_sequences=stop_sequences,
+                )
+                self.response_mime_type = response_mime_type
+
         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
             print_verbose(f"\nProcessing input messages = {messages}")
@@ -734,7 +782,7 @@ async def async_completion(
             ## LLM Call
             response = await llm_model._generate_content_async(
                 contents=content,
-                generation_config=GenerationConfig(**optional_params),
+                generation_config=ExtendedGenerationConfig(**optional_params),
                 tools=tools,
             )

@@ -920,6 +968,30 @@ async def async_streaming(
     Add support for async streaming calls for gemini-pro
     """
     from vertexai.preview.generative_models import GenerationConfig

+    class ExtendedGenerationConfig(GenerationConfig):
+        """Extended parameters for the generation."""
+
+        def __init__(
+            self,
+            *,
+            temperature: Optional[float] = None,
+            top_p: Optional[float] = None,
+            top_k: Optional[int] = None,
+            candidate_count: Optional[int] = None,
+            max_output_tokens: Optional[int] = None,
+            stop_sequences: Optional[List[str]] = None,
+            response_mime_type: Optional[str] = None,
+        ):
+            super().__init__(
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                candidate_count=candidate_count,
+                max_output_tokens=max_output_tokens,
+                stop_sequences=stop_sequences,
+            )
+            self.response_mime_type = response_mime_type
+
     if mode == "vision":
         stream = optional_params.pop("stream")
         tools = optional_params.pop("tools", None)
@@ -940,7 +1012,7 @@ async def async_streaming(

         response = await llm_model._generate_content_streaming_async(
             contents=content,
-            generation_config=GenerationConfig(**optional_params),
+            generation_config=ExtendedGenerationConfig(**optional_params),
             tools=tools,
         )
         optional_params["stream"] = True

From 05350037bee54d35c314e863724f1421643a1458 Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:45:41 +0000
Subject: [PATCH 3/4] (feat) - Extreme dirty hack for response_mime_type in Vertex AI.

---
 litellm/llms/vertex_ai.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index 3bd4579e4..0fff09aa1 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -322,15 +322,15 @@ def completion(
                 stop_sequences: Optional[List[str]] = None,
                 response_mime_type: Optional[str] = None,
             ):
-                super().__init__(
+                self._raw_generation_config = gapic_content_types.GenerationConfig(
                     temperature=temperature,
                     top_p=top_p,
                     top_k=top_k,
                     candidate_count=candidate_count,
                     max_output_tokens=max_output_tokens,
                     stop_sequences=stop_sequences,
+                    response_mime_type=response_mime_type,
                 )
-                self.response_mime_type = response_mime_type

         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         print_verbose(
             f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
@@ -735,6 +735,7 @@ async def async_completion(
     """
     try:
         from vertexai.preview.generative_models import GenerationConfig
+        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore

         class ExtendedGenerationConfig(GenerationConfig):
             """Extended parameters for the generation."""
@@ -750,15 +751,15 @@ async def async_completion(
                 stop_sequences: Optional[List[str]] = None,
                 response_mime_type: Optional[str] = None,
             ):
-                super().__init__(
+                self._raw_generation_config = gapic_content_types.GenerationConfig(
                     temperature=temperature,
                     top_p=top_p,
                     top_k=top_k,
                     candidate_count=candidate_count,
                     max_output_tokens=max_output_tokens,
                     stop_sequences=stop_sequences,
+                    response_mime_type=response_mime_type,
                 )
-                self.response_mime_type = response_mime_type

         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@@ -967,6 +968,7 @@ async def async_streaming(
     Add support for async streaming calls for gemini-pro
     """
     from vertexai.preview.generative_models import GenerationConfig
+    from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore

     class ExtendedGenerationConfig(GenerationConfig):
         """Extended parameters for the generation."""
@@ -982,15 +984,15 @@ async def async_streaming(
             stop_sequences: Optional[List[str]] = None,
             response_mime_type: Optional[str] = None,
         ):
-            super().__init__(
+            self._raw_generation_config = gapic_content_types.GenerationConfig(
                 temperature=temperature,
                 top_p=top_p,
                 top_k=top_k,
                 candidate_count=candidate_count,
                 max_output_tokens=max_output_tokens,
                 stop_sequences=stop_sequences,
+                response_mime_type=response_mime_type,
             )
-            self.response_mime_type = response_mime_type

     if mode == "vision":
         stream = optional_params.pop("stream")

From 9c55be3e8246a22069cd869b3febe0b96e9bf48c Mon Sep 17 00:00:00 2001
From: David Manouchehri
Date: Thu, 11 Apr 2024 23:54:55 +0000
Subject: [PATCH 4/4] (feat) - Bump version for Vertex AI SDK.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f09dd7501..2acb41165 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@ pandas==2.1.1 # for viewing clickhouse spend analytics
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
 pynacl==1.5.0 # for encrypting keys
-google-cloud-aiplatform==1.43.0 # for vertex ai calls
+google-cloud-aiplatform==1.47.0 # for vertex ai calls
 anthropic[vertex]==0.21.3
 google-generativeai==0.3.2 # for vertex ai calls
 async_generator==1.10.0 # for async ollama calls
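
Illustrative usage (a minimal sketch, not part of the patches above): assuming Vertex AI credentials, project, and location are already configured for litellm, and using "vertex_ai/gemini-pro" purely as an example model name, a caller could exercise the JSON mode added in PATCH 1/4 as below; response_format, n, and stop are mapped to response_mime_type, candidate_count, and stop_sequences by the get_optional_params changes.

# Hypothetical example; the model name, prompt, and stop sequence are illustrative only.
import litellm

response = litellm.completion(
    model="vertex_ai/gemini-pro",  # assumes Vertex AI auth/project/location are set up
    messages=[{"role": "user", "content": "Return three colors as a JSON array."}],
    response_format={"type": "json_object"},  # mapped to response_mime_type="application/json"
    n=1,                                      # mapped to candidate_count
    stop=["###"],                             # mapped to stop_sequences
)
print(response.choices[0].message.content)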