Merge pull request #4845 from BerriAI/litellm_vertex_ai_llama3_1_api

feat(vertex_ai_llama.py): vertex ai llama3.1 api support
2024-07-23 21:51:46 -07:00 · 2024-07-23 21:51:46 -07:00 · 6c580ac8dc
commit 6c580ac8dc
parent fb0a13c8bb ae693424e4
9 changed files with 338 additions and 19 deletions
--- a/litellm/init.py
+++ b/litellm/init.py
@ -357,6 +357,7 @@ vertex_text_models: List = []
 vertex_code_text_models: List = []
 vertex_embedding_models: List = []
 vertex_anthropic_models: List = []
 vertex_llama3_models: List = []
 ai21_models: List = []
 nlp_cloud_models: List = []
 aleph_alpha_models: List = []
@ -399,6 +400,9 @@ for key, value in model_cost.items():
    elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
        key = key.replace("vertex_ai/", "")
        vertex_anthropic_models.append(key)
    elif value.get("litellm_provider") == "vertex_ai-llama_models":
        key = key.replace("vertex_ai/", "")
        vertex_llama3_models.append(key)
    elif value.get("litellm_provider") == "ai21":
        ai21_models.append(key)
    elif value.get("litellm_provider") == "nlp_cloud":
@ -828,6 +832,7 @@ from .llms.petals import PetalsConfig
 from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
 from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
 from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
 from .llms.vertex_ai_llama import VertexAILlama3Config
 from .llms.sagemaker import SagemakerConfig
 from .llms.ollama import OllamaConfig
 from .llms.ollama_chat import OllamaChatConfig
--- a/litellm/llms/vertex_ai_llama.py
+++ b/litellm/llms/vertex_ai_llama.py
@ -0,0 +1,203 @@
 # What is this?
 ## Handler for calling llama 3.1 API on Vertex AI
 import copy
 import json
 import os
 import time
 import types
 import uuid
 from enum import Enum
 from typing import Any, Callable, List, Optional, Tuple, Union
 import httpx  # type: ignore
 import requests  # type: ignore
 import litellm
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.llms.anthropic import (
    AnthropicMessagesTool,
    AnthropicMessagesToolChoice,
 )
 from litellm.types.llms.openai import (
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
 )
 from litellm.types.utils import ResponseFormatChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
 from .base import BaseLLM
 from .prompt_templates.factory import (
    construct_tool_use_system_prompt,
    contains_tag,
    custom_prompt,
    extract_between_tags,
    parse_xml_params,
    prompt_factory,
    response_schema_prompt,
 )
 class VertexAIError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        self.request = httpx.Request(
            method="POST", url=" https://cloud.google.com/vertex-ai/"
        )
        self.response = httpx.Response(status_code=status_code, request=self.request)
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
 class VertexAILlama3Config:
    """
    Reference:https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama#streaming
    The class `VertexAILlama3Config` provides configuration for the VertexAI's Llama API interface. Below are the parameters:
    - `max_tokens` Required (integer) max tokens,
    Note: Please make sure to modify the default parameters as required for your use case.
    """
    max_tokens: Optional[int] = None
    def __init__(
        self,
        max_tokens: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key == "max_tokens" and value is None:
                value = self.max_tokens
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }
    def get_supported_openai_params(self):
        return [
            "max_tokens",
            "stream",
        ]
    def map_openai_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
            if param == "max_tokens":
                optional_params["max_tokens"] = value
        return optional_params
 class VertexAILlama3(BaseLLM):
    def __init__(self) -> None:
        pass
    def create_vertex_llama3_url(
        self, vertex_location: str, vertex_project: str
    ) -> str:
        return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/openapi"
    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding,
        logging_obj,
        optional_params: dict,
        custom_prompt_dict: dict,
        headers: Optional[dict],
        timeout: Union[float, httpx.Timeout],
        vertex_project=None,
        vertex_location=None,
        vertex_credentials=None,
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
        client=None,
    ):
        try:
            import vertexai
            from google.cloud import aiplatform
            from litellm.llms.openai import OpenAIChatCompletion
            from litellm.llms.vertex_httpx import VertexLLM
        except Exception:
            raise VertexAIError(
                status_code=400,
                message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""",
            )
        if not (
            hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
        ):
            raise VertexAIError(
                status_code=400,
                message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
            )
        try:
            vertex_httpx_logic = VertexLLM()
            access_token, project_id = vertex_httpx_logic._ensure_access_token(
                credentials=vertex_credentials, project_id=vertex_project
            )
            openai_chat_completions = OpenAIChatCompletion()
            ## Load Config
            # config = litellm.VertexAILlama3.get_config()
            # for k, v in config.items():
            #     if k not in optional_params:
            #         optional_params[k] = v
            ## CONSTRUCT API BASE
            stream: bool = optional_params.get("stream", False) or False
            optional_params["stream"] = stream
            api_base = self.create_vertex_llama3_url(
                vertex_location=vertex_location or "us-central1",
                vertex_project=vertex_project or project_id,
            )
            return openai_chat_completions.completion(
                model=model,
                messages=messages,
                api_base=api_base,
                api_key=access_token,
                custom_prompt_dict=custom_prompt_dict,
                model_response=model_response,
                print_verbose=print_verbose,
                logging_obj=logging_obj,
                optional_params=optional_params,
                acompletion=acompletion,
                litellm_params=litellm_params,
                logger_fn=logger_fn,
                client=client,
                timeout=timeout,
            )
        except Exception as e:
            raise VertexAIError(status_code=500, message=str(e))
--- a/litellm/llms/vertex_httpx.py
+++ b/litellm/llms/vertex_httpx.py
@ -1189,7 +1189,7 @@ class VertexLLM(BaseLLM):
            response.raise_for_status()
        except httpx.HTTPStatusError as err:
            error_code = err.response.status_code
-            raise VertexAIError(status_code=error_code, message=response.text)
+            raise VertexAIError(status_code=error_code, message=err.response.text)
        except httpx.TimeoutException:
            raise VertexAIError(status_code=408, message="Timeout error occurred.")
--- a/litellm/main.py
+++ b/litellm/main.py
@ -120,6 +120,7 @@ from .llms.prompt_templates.factory import (
 )
 from .llms.text_completion_codestral import CodestralTextCompletion
 from .llms.triton import TritonChatCompletion
 from .llms.vertex_ai_llama import VertexAILlama3
 from .llms.vertex_httpx import VertexLLM
 from .llms.watsonx import IBMWatsonXAI
 from .types.llms.openai import HttpxBinaryResponseContent
@ -156,6 +157,7 @@ triton_chat_completions = TritonChatCompletion()
 bedrock_chat_completion = BedrockLLM()
 bedrock_converse_chat_completion = BedrockConverseLLM()
 vertex_chat_completion = VertexLLM()
 vertex_llama_chat_completion = VertexAILlama3()
 watsonxai = IBMWatsonXAI()
 ####### COMPLETION ENDPOINTS ################
@ -2064,7 +2066,26 @@ def completion(
                    timeout=timeout,
                    client=client,
                )
-
+            elif model.startswith("meta/"):
                model_response = vertex_llama_chat_completion.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=new_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    vertex_location=vertex_ai_location,
                    vertex_project=vertex_ai_project,
                    vertex_credentials=vertex_credentials,
                    logging_obj=logging,
                    acompletion=acompletion,
                    headers=headers,
                    custom_prompt_dict=custom_prompt_dict,
                    timeout=timeout,
                    client=client,
                )
            else:
                model_response = vertex_ai.completion(
                    model=model,
@ -2478,28 +2499,25 @@ def completion(
                return generator
            response = generator
-        
+
        elif custom_llm_provider == "triton":
-            api_base = (
+            api_base = litellm.api_base or api_base
                litellm.api_base  or api_base
            )
            model_response = triton_chat_completions.completion(
-            api_base=api_base,
+                api_base=api_base,
-            timeout=timeout, # type: ignore
+                timeout=timeout,  # type: ignore
-            model=model,
+                model=model,
-            messages=messages,
+                messages=messages,
-            model_response=model_response,
+                model_response=model_response,
-            optional_params=optional_params,
+                optional_params=optional_params,
-            logging_obj=logging,
+                logging_obj=logging,
-            stream=stream,
+                stream=stream,
-            acompletion=acompletion
+                acompletion=acompletion,
            )
            ## RESPONSE OBJECT
            response = model_response
            return response
-        
+
        elif custom_llm_provider == "cloudflare":
            api_key = (
                api_key
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -1948,6 +1948,16 @@
        "supports_function_calling": true,
        "supports_vision": true
    },
    "vertex_ai/meta/llama3-405b-instruct-maas": {
        "max_tokens": 32000,
        "max_input_tokens": 32000,
        "max_output_tokens": 32000,
        "input_cost_per_token": 0.0,
        "output_cost_per_token": 0.0,
        "litellm_provider": "vertex_ai-llama_models",
        "mode": "chat",
        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
    },
    "vertex_ai/imagegeneration@006": {
        "cost_per_image": 0.020,
        "litellm_provider": "vertex_ai-image-models",
--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@ -895,6 +895,52 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):
            pytest.fail("An unexpected exception occurred - {}".format(str(e)))
 from litellm.tests.test_completion import response_format_tests
@pytest.mark.parametrize(
    "model", ["vertex_ai/meta/llama3-405b-instruct-maas"]
 )  # "vertex_ai",
@pytest.mark.parametrize("sync_mode", [True, False])  # "vertex_ai",
@pytest.mark.asyncio
 async def test_llama_3_httpx(model, sync_mode):
    try:
        load_vertex_ai_credentials()
        litellm.set_verbose = True
        messages = [
            {
                "role": "system",
                "content": "Your name is Litellm Bot, you are a helpful assistant",
            },
            # User asks for their name and weather in San Francisco
            {
                "role": "user",
                "content": "Hello, what is your name and can you tell me the weather?",
            },
        ]
        data = {
            "model": model,
            "messages": messages,
        }
        if sync_mode:
            response = litellm.completion(**data)
        else:
            response = await litellm.acompletion(**data)
        response_format_tests(response=response)
        print(f"response: {response}")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        if "429 Quota exceeded" in str(e):
            pass
        else:
            pytest.fail("An unexpected exception occurred - {}".format(str(e)))
 def vertex_httpx_mock_reject_prompt_post(*args, **kwargs):
    mock_response = MagicMock()
    mock_response.status_code = 200
--- a/litellm/tests/test_optional_params.py
+++ b/litellm/tests/test_optional_params.py
@ -128,6 +128,19 @@ def test_azure_ai_mistral_optional_params():
    assert "user" not in optional_params
 def test_vertex_ai_llama_3_optional_params():
    litellm.vertex_llama3_models = ["meta/llama3-405b-instruct-maas"]
    litellm.drop_params = True
    optional_params = get_optional_params(
        model="meta/llama3-405b-instruct-maas",
        user="John",
        custom_llm_provider="vertex_ai",
        max_tokens=10,
        temperature=0.2,
    )
    assert "user" not in optional_params
 def test_azure_gpt_optional_params_gpt_vision():
    # for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here
    optional_params = litellm.utils.get_optional_params(
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -3088,6 +3088,15 @@ def get_optional_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
        )
    elif custom_llm_provider == "vertex_ai" and model in litellm.vertex_llama3_models:
        supported_params = get_supported_openai_params(
            model=model, custom_llm_provider=custom_llm_provider
        )
        _check_valid_arg(supported_params=supported_params)
        optional_params = litellm.VertexAILlama3Config().map_openai_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
        )
    elif custom_llm_provider == "sagemaker":
        ## check if unsupported param passed in
        supported_params = get_supported_openai_params(
@ -4189,6 +4198,9 @@ def get_supported_openai_params(
        return litellm.GoogleAIStudioGeminiConfig().get_supported_openai_params()
    elif custom_llm_provider == "vertex_ai":
        if request_type == "chat_completion":
            if model.startswith("meta/"):
                return litellm.VertexAILlama3Config().get_supported_openai_params()
            return litellm.VertexAIConfig().get_supported_openai_params()
        elif request_type == "embeddings":
            return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
@ -5752,10 +5764,12 @@ def convert_to_model_response_object(
                model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0)  # type: ignore
            if "created" in response_object:
-                model_response_object.created = response_object["created"]
+                model_response_object.created = response_object["created"] or int(
                    time.time()
                )
            if "id" in response_object:
-                model_response_object.id = response_object["id"]
+                model_response_object.id = response_object["id"] or str(uuid.uuid4())
            if "system_fingerprint" in response_object:
                model_response_object.system_fingerprint = response_object[
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@ -1948,6 +1948,16 @@
        "supports_function_calling": true,
        "supports_vision": true
    },
    "vertex_ai/meta/llama3-405b-instruct-maas": {
        "max_tokens": 32000,
        "max_input_tokens": 32000,
        "max_output_tokens": 32000,
        "input_cost_per_token": 0.0,
        "output_cost_per_token": 0.0,
        "litellm_provider": "vertex_ai-llama_models",
        "mode": "chat",
        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
    },
    "vertex_ai/imagegeneration@006": {
        "cost_per_image": 0.020,
        "litellm_provider": "vertex_ai-image-models",