From 8845bd4d76605c4eb78a0a7d95fe8b29e6cd8b39 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 10:42:17 -0700 Subject: [PATCH 01/13] doc - using anthropic with litellm proxy server --- docs/my-website/docs/providers/anthropic.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index deb640b17..496343f87 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -56,7 +56,7 @@ for chunk in response: print(chunk["choices"][0]["delta"]["content"]) # same as openai format ``` -## OpenAI Proxy Usage +## Usage with LiteLLM Proxy Here's how to call Anthropic with the LiteLLM Proxy Server @@ -69,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key" ### 2. Start the proxy - - -```bash -$ litellm --model claude-3-opus-20240229 - -# Server running on http://0.0.0.0:4000 -``` - ```yaml @@ -91,6 +83,14 @@ model_list: litellm --config /path/to/config.yaml ``` + + +```bash +$ litellm --model claude-3-opus-20240229 + +# Server running on http://0.0.0.0:4000 +``` + ### 3. Test it From 2dcd9a556776bc0747f88260deede3114d3d8140 Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Tue, 23 Jul 2024 19:12:24 +0000 Subject: [PATCH 02/13] (test - azure): Add test for Azure OIDC auth. --- litellm/tests/test_embedding.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 39a9e7f39..940f10e88 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -196,6 +196,28 @@ def test_openai_azure_embedding(): except Exception as e: pytest.fail(f"Error occurred: {e}") +@pytest.mark.skipif( + os.environ.get("CIRCLE_OIDC_TOKEN") is None, + reason="Cannot run without being in CircleCI Runner", +) +def test_openai_azure_embedding_with_oidc_and_cf(): + # TODO: Switch to our own Azure account, currently using ai.moda's account + os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c" + os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0" + + try: + response = embedding( + model="azure/text-embedding-ada-002", + input=["Hello"], + azure_ad_token="oidc/circleci/", + api_base="https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/eastus2-litellm", + api_version="2024-06-01", + ) + print(response) + + except Exception as e: + pytest.fail(f"Error occurred: {e}") + def test_openai_azure_embedding_optional_arg(mocker): mocked_create_embeddings = mocker.patch.object( From c3d90f9aee9700fa69c3bd119dae465e8364b0ae Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 15:23:58 -0700 Subject: [PATCH 03/13] test_anthropic_completion_input_translation_with_metadata --- litellm/tests/test_anthropic_completion.py | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/litellm/tests/test_anthropic_completion.py b/litellm/tests/test_anthropic_completion.py index cac0945d8..15d150a56 100644 --- a/litellm/tests/test_anthropic_completion.py +++ b/litellm/tests/test_anthropic_completion.py @@ -48,6 +48,42 @@ def test_anthropic_completion_input_translation(): ] +def test_anthropic_completion_input_translation_with_metadata(): + """ + Tests that cost tracking works as expected with LiteLLM Proxy + + LiteLLM Proxy will insert litellm_metadata for anthropic endpoints to track user_api_key and user_api_key_team_id + + This test ensures that the 
`litellm_metadata` is not present in the translated input + It ensures that `litellm.acompletion()` will receieve metadata which is a litellm specific param + """ + data = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hey, how's it going?"}], + "litellm_metadata": { + "user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "user_api_key_alias": None, + "user_api_end_user_max_budget": None, + "litellm_api_version": "1.40.19", + "global_max_parallel_requests": None, + "user_api_key_user_id": "default_user_id", + "user_api_key_org_id": None, + "user_api_key_team_id": None, + "user_api_key_team_alias": None, + "user_api_key_team_max_budget": None, + "user_api_key_team_spend": None, + "user_api_key_spend": 0.0, + "user_api_key_max_budget": None, + "user_api_key_metadata": {}, + }, + } + translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data) + + assert "litellm_metadata" not in translated_input + assert "metadata" in translated_input + assert translated_input["metadata"] == data["litellm_metadata"] + + def test_anthropic_completion_e2e(): litellm.set_verbose = True From 4c1ee1e282bb65a70a6b43a43d207fd3c955b05a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 15:25:46 -0700 Subject: [PATCH 04/13] fix add better debugging _PROXY_track_cost_callback --- litellm/proxy/proxy_server.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 0ac1d82e0..106b95453 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -657,7 +657,11 @@ async def _PROXY_track_cost_callback( global prisma_client, custom_db_client try: # check if it has collected an entire stream response - verbose_proxy_logger.debug("Proxy: In track_cost_callback for: %s", kwargs) + verbose_proxy_logger.debug( + "Proxy: In track_cost_callback for: kwargs=%s and completion_response: %s", + kwargs, + completion_response, + ) verbose_proxy_logger.debug( f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}" ) From a71b60d005292d6d4cdcdf2f5ba26177c3f76acd Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 15:31:30 -0700 Subject: [PATCH 05/13] Pass litellm proxy specific metadata --- litellm/llms/anthropic.py | 5 +++++ litellm/proxy/litellm_pre_call_utils.py | 3 +++ litellm/types/llms/anthropic.py | 5 ++++- litellm/types/llms/openai.py | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index da51e887d..629197d51 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -385,6 +385,11 @@ class AnthropicConfig: if "user_id" in anthropic_message_request["metadata"]: new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"] + # Pass litellm proxy specific metadata + if "litellm_metadata" in anthropic_message_request: + # metadata will be passed to litellm.acompletion(), it's a litellm_param + new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata") + ## CONVERT TOOL CHOICE if "tool_choice" in anthropic_message_request: new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai( diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index 8909b1da3..7384dc30b 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -39,6 +39,9 @@ def 
_get_metadata_variable_name(request: Request) -> str: """ if "thread" in request.url.path or "assistant" in request.url.path: return "litellm_metadata" + if "/v1/messages" in request.url.path: + # anthropic API has a field called metadata + return "litellm_metadata" else: return "metadata" diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py index 33f413ece..b41980afd 100644 --- a/litellm/types/llms/anthropic.py +++ b/litellm/types/llms/anthropic.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union from pydantic import BaseModel, validator from typing_extensions import Literal, Required, TypedDict @@ -113,6 +113,9 @@ class AnthropicMessagesRequest(TypedDict, total=False): top_k: int top_p: float + # litellm param - used for tracking litellm proxy metadata in the request + litellm_metadata: dict + class ContentTextBlockDelta(TypedDict): """ diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index 294e299db..35e442119 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -436,6 +436,7 @@ class ChatCompletionRequest(TypedDict, total=False): function_call: Union[str, dict] functions: List user: str + metadata: dict # litellm specific param class ChatCompletionDeltaChunk(TypedDict, total=False): From 169da8b8d0ee00d04043bfb05c1430a1f313dce1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 15:39:21 -0700 Subject: [PATCH 06/13] docs(guardrails.md): add team-based controls to guardrails --- docs/my-website/docs/proxy/guardrails.md | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/my-website/docs/proxy/guardrails.md b/docs/my-website/docs/proxy/guardrails.md index 053fa8cab..2cfa3980e 100644 --- a/docs/my-website/docs/proxy/guardrails.md +++ b/docs/my-website/docs/proxy/guardrails.md @@ -266,6 +266,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ }' ``` +## Disable team from turning on/off guardrails + + +### 1. Disable team from modifying guardrails + +```bash +curl -X POST 'http://0.0.0.0:4000/team/update' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-D '{ + "team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50", + "metadata": {"guardrails": {"modify_guardrails": false}} +}' +``` + +### 2. Try to disable guardrails for a call + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +--data '{ +"model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "Think of 10 random colors." + } + ], + "metadata": {"guardrails": {"hide_secrets": false}} +}' +``` + +### 3. Get 403 Error + +``` +{ + "error": { + "message": { + "error": "Your team does not have permission to modify guardrails." + }, + "type": "auth_error", + "param": "None", + "code": 403 + } +} +``` + Expect to NOT see `+1 412-612-9992` in your server logs on your callback. 
:::info From 78eb5164df7d02c3369673c93afb4016523ce5c2 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 16:33:04 -0700 Subject: [PATCH 07/13] fix DB accept null values for api_base, user, etc --- litellm/proxy/schema.prisma | 14 +++++++------- schema.prisma | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma index 528d7e98d..cf61635a0 100644 --- a/litellm/proxy/schema.prisma +++ b/litellm/proxy/schema.prisma @@ -183,12 +183,12 @@ model LiteLLM_SpendLogs { model String @default("") model_id String? @default("") // the model id stored in proxy model db model_group String? @default("") // public model_name / model_group - api_base String @default("") - user String @default("") - metadata Json @default("{}") - cache_hit String @default("") - cache_key String @default("") - request_tags Json @default("[]") + api_base String? @default("") + user String? @default("") + metadata Json? @default("{}") + cache_hit String? @default("") + cache_key String? @default("") + request_tags Json? @default("[]") team_id String? end_user String? requester_ip_address String? @@ -257,4 +257,4 @@ model LiteLLM_AuditLog { object_id String // id of the object being audited. This can be the key id, team id, user id, model id before_value Json? // value of the row updated_values Json? // value of the row after change -} \ No newline at end of file +} diff --git a/schema.prisma b/schema.prisma index 970a1197e..8f4125104 100644 --- a/schema.prisma +++ b/schema.prisma @@ -172,7 +172,7 @@ model LiteLLM_Config { model LiteLLM_SpendLogs { request_id String @id call_type String - api_key String @default ("") + api_key String @default ("") // Hashed API Token. Not the actual Virtual Key. Equivalent to 'token' column in LiteLLM_VerificationToken spend Float @default(0.0) total_tokens Int @default(0) prompt_tokens Int @default(0) @@ -183,12 +183,12 @@ model LiteLLM_SpendLogs { model String @default("") model_id String? @default("") // the model id stored in proxy model db model_group String? @default("") // public model_name / model_group - api_base String @default("") - user String @default("") - metadata Json @default("{}") - cache_hit String @default("") - cache_key String @default("") - request_tags Json @default("[]") + api_base String? @default("") + user String? @default("") + metadata Json? @default("{}") + cache_hit String? @default("") + cache_key String? @default("") + request_tags Json? @default("[]") team_id String? end_user String? requester_ip_address String? 
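For anyone applying this schema change to an existing deployment, the relaxed (now-nullable) SpendLogs columns still have to be pushed to the database. A minimal sketch using the standard Prisma CLI; the `prisma generate` and `prisma db push` commands and their `--schema` flag come from Prisma's own tooling, not from this patch:

```bash
# Regenerate the Prisma client so the nullable SpendLogs fields show up in the generated types
prisma generate --schema ./litellm/proxy/schema.prisma

# Push the relaxed column definitions (api_base, user, metadata, cache_hit, cache_key, request_tags) to the database
prisma db push --schema ./litellm/proxy/schema.prisma
```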
From 83b13d34baf7d7423e67f22a4f024fa528323cdb Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Tue, 23 Jul 2024 16:48:50 -0700 Subject: [PATCH 08/13] =?UTF-8?q?bump:=20version=201.41.27=20=E2=86=92=201?= =?UTF-8?q?.41.28?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5dc8ab62d..8a2168d2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.41.27" +version = "1.41.28" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.41.27" +version = "1.41.28" version_files = [ "pyproject.toml:^version" ] From 83ef52e18005db1e0b6ee9756c1edebf5820887e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 17:07:30 -0700 Subject: [PATCH 09/13] feat(vertex_ai_llama.py): vertex ai llama3.1 api support Initial working commit for vertex ai llama 3.1 api support --- litellm/llms/vertex_ai_llama.py | 270 ++++++++++++++++++ litellm/llms/vertex_httpx.py | 2 +- litellm/main.py | 50 ++-- .../tests/test_amazing_vertex_completion.py | 46 +++ litellm/utils.py | 6 +- 5 files changed, 355 insertions(+), 19 deletions(-) create mode 100644 litellm/llms/vertex_ai_llama.py diff --git a/litellm/llms/vertex_ai_llama.py b/litellm/llms/vertex_ai_llama.py new file mode 100644 index 000000000..4b5407faa --- /dev/null +++ b/litellm/llms/vertex_ai_llama.py @@ -0,0 +1,270 @@ +# What is this? +## Handler for calling llama 3.1 API on Vertex AI +import copy +import json +import os +import time +import types +import uuid +from enum import Enum +from typing import Any, Callable, List, Optional, Tuple, Union + +import httpx # type: ignore +import requests # type: ignore + +import litellm +from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.types.llms.anthropic import ( + AnthropicMessagesTool, + AnthropicMessagesToolChoice, +) +from litellm.types.llms.openai import ( + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, +) +from litellm.types.utils import ResponseFormatChunk +from litellm.utils import CustomStreamWrapper, ModelResponse, Usage + +from .base import BaseLLM +from .prompt_templates.factory import ( + construct_tool_use_system_prompt, + contains_tag, + custom_prompt, + extract_between_tags, + parse_xml_params, + prompt_factory, + response_schema_prompt, +) + + +class VertexAIError(Exception): + def __init__(self, status_code, message): + self.status_code = status_code + self.message = message + self.request = httpx.Request( + method="POST", url=" https://cloud.google.com/vertex-ai/" + ) + self.response = httpx.Response(status_code=status_code, request=self.request) + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +class VertexAILlama3Config: + """ + Reference:https://docs.anthropic.com/claude/reference/messages_post + + Note that the API for Claude on Vertex differs from the Anthropic API documentation in the following ways: + + - `model` is not a valid parameter. The model is instead specified in the Google Cloud endpoint URL. + - `anthropic_version` is a required parameter and must be set to "vertex-2023-10-16". 
+ + The class `VertexAIAnthropicConfig` provides configuration for the VertexAI's Anthropic API interface. Below are the parameters: + + - `max_tokens` Required (integer) max tokens, + - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31" + - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py + - `temperature` Optional (float) The amount of randomness injected into the response + - `top_p` Optional (float) Use nucleus sampling. + - `top_k` Optional (int) Only sample from the top K options for each subsequent token + - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating + + Note: Please make sure to modify the default parameters as required for your use case. + """ + + max_tokens: Optional[int] = ( + 4096 # anthropic max - setting this doesn't impact response, but is required by anthropic. + ) + system: Optional[str] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + stop_sequences: Optional[List[str]] = None + + def __init__( + self, + max_tokens: Optional[int] = None, + anthropic_version: Optional[str] = None, + ) -> None: + locals_ = locals() + for key, value in locals_.items(): + if key == "max_tokens" and value is None: + value = self.max_tokens + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params(self): + return [ + "max_tokens", + "tools", + "tool_choice", + "stream", + "stop", + "temperature", + "top_p", + "response_format", + ] + + def map_openai_params(self, non_default_params: dict, optional_params: dict): + for param, value in non_default_params.items(): + if param == "max_tokens": + optional_params["max_tokens"] = value + if param == "tools": + optional_params["tools"] = value + if param == "tool_choice": + _tool_choice: Optional[AnthropicMessagesToolChoice] = None + if value == "auto": + _tool_choice = {"type": "auto"} + elif value == "required": + _tool_choice = {"type": "any"} + elif isinstance(value, dict): + _tool_choice = {"type": "tool", "name": value["function"]["name"]} + + if _tool_choice is not None: + optional_params["tool_choice"] = _tool_choice + if param == "stream": + optional_params["stream"] = value + if param == "stop": + optional_params["stop_sequences"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "top_p": + optional_params["top_p"] = value + if param == "response_format" and "response_schema" in value: + """ + When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode + - You usually want to provide a single tool + - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool + - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
+ """ + _tool_choice = None + _tool_choice = {"name": "json_tool_call", "type": "tool"} + + _tool = AnthropicMessagesTool( + name="json_tool_call", + input_schema={ + "type": "object", + "properties": {"values": value["response_schema"]}, # type: ignore + }, + ) + + optional_params["tools"] = [_tool] + optional_params["tool_choice"] = _tool_choice + optional_params["json_mode"] = True + + return optional_params + + +class VertexAILlama3(BaseLLM): + def __init__(self) -> None: + pass + + def create_vertex_llama3_url( + self, vertex_location: str, vertex_project: str + ) -> str: + return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/openapi" + + def completion( + self, + model: str, + messages: list, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + logging_obj, + optional_params: dict, + custom_prompt_dict: dict, + headers: Optional[dict], + timeout: Union[float, httpx.Timeout], + vertex_project=None, + vertex_location=None, + vertex_credentials=None, + litellm_params=None, + logger_fn=None, + acompletion: bool = False, + client=None, + ): + try: + import vertexai + from google.cloud import aiplatform + + from litellm.llms.openai import OpenAIChatCompletion + from litellm.llms.vertex_httpx import VertexLLM + except Exception: + + raise VertexAIError( + status_code=400, + message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""", + ) + + if not ( + hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") + ): + raise VertexAIError( + status_code=400, + message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""", + ) + try: + + vertex_httpx_logic = VertexLLM() + + access_token, project_id = vertex_httpx_logic._ensure_access_token( + credentials=vertex_credentials, project_id=vertex_project + ) + + openai_chat_completions = OpenAIChatCompletion() + + ## Load Config + # config = litellm.VertexAILlama3.get_config() + # for k, v in config.items(): + # if k not in optional_params: + # optional_params[k] = v + + ## CONSTRUCT API BASE + stream: bool = optional_params.get("stream", False) or False + + optional_params["stream"] = stream + + api_base = self.create_vertex_llama3_url( + vertex_location=vertex_location or "us-central1", + vertex_project=vertex_project or project_id, + ) + + return openai_chat_completions.completion( + model=model, + messages=messages, + api_base=api_base, + api_key=access_token, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + logging_obj=logging_obj, + optional_params=optional_params, + acompletion=acompletion, + litellm_params=litellm_params, + logger_fn=logger_fn, + client=client, + timeout=timeout, + ) + + except Exception as e: + raise VertexAIError(status_code=500, message=str(e)) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index a8de79aff..93d8f4282 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -1189,7 +1189,7 @@ class VertexLLM(BaseLLM): response.raise_for_status() except httpx.HTTPStatusError as err: error_code = err.response.status_code - raise VertexAIError(status_code=error_code, message=response.text) + raise VertexAIError(status_code=error_code, message=err.response.text) except httpx.TimeoutException: raise VertexAIError(status_code=408, message="Timeout error occurred.") diff --git a/litellm/main.py b/litellm/main.py index fad2e15cc..35fad5e02 100644 --- 
a/litellm/main.py +++ b/litellm/main.py @@ -120,6 +120,7 @@ from .llms.prompt_templates.factory import ( ) from .llms.text_completion_codestral import CodestralTextCompletion from .llms.triton import TritonChatCompletion +from .llms.vertex_ai_llama import VertexAILlama3 from .llms.vertex_httpx import VertexLLM from .llms.watsonx import IBMWatsonXAI from .types.llms.openai import HttpxBinaryResponseContent @@ -156,6 +157,7 @@ triton_chat_completions = TritonChatCompletion() bedrock_chat_completion = BedrockLLM() bedrock_converse_chat_completion = BedrockConverseLLM() vertex_chat_completion = VertexLLM() +vertex_llama_chat_completion = VertexAILlama3() watsonxai = IBMWatsonXAI() ####### COMPLETION ENDPOINTS ################ @@ -2064,7 +2066,26 @@ def completion( timeout=timeout, client=client, ) - + elif model.startswith("meta/"): + model_response = vertex_llama_chat_completion.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=new_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + vertex_location=vertex_ai_location, + vertex_project=vertex_ai_project, + vertex_credentials=vertex_credentials, + logging_obj=logging, + acompletion=acompletion, + headers=headers, + custom_prompt_dict=custom_prompt_dict, + timeout=timeout, + client=client, + ) else: model_response = vertex_ai.completion( model=model, @@ -2478,28 +2499,25 @@ def completion( return generator response = generator - + elif custom_llm_provider == "triton": - api_base = ( - litellm.api_base or api_base - ) + api_base = litellm.api_base or api_base model_response = triton_chat_completions.completion( - api_base=api_base, - timeout=timeout, # type: ignore - model=model, - messages=messages, - model_response=model_response, - optional_params=optional_params, - logging_obj=logging, - stream=stream, - acompletion=acompletion + api_base=api_base, + timeout=timeout, # type: ignore + model=model, + messages=messages, + model_response=model_response, + optional_params=optional_params, + logging_obj=logging, + stream=stream, + acompletion=acompletion, ) ## RESPONSE OBJECT response = model_response return response - - + elif custom_llm_provider == "cloudflare": api_key = ( api_key diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 3def5a1ec..b9762afcb 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -895,6 +895,52 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode): pytest.fail("An unexpected exception occurred - {}".format(str(e))) +from litellm.tests.test_completion import response_format_tests + + +@pytest.mark.parametrize( + "model", ["vertex_ai/meta/llama3-405b-instruct-maas"] +) # "vertex_ai", +@pytest.mark.parametrize("sync_mode", [True, False]) # "vertex_ai", +@pytest.mark.asyncio +async def test_llama_3_httpx(model, sync_mode): + try: + load_vertex_ai_credentials() + litellm.set_verbose = True + + messages = [ + { + "role": "system", + "content": "Your name is Litellm Bot, you are a helpful assistant", + }, + # User asks for their name and weather in San Francisco + { + "role": "user", + "content": "Hello, what is your name and can you tell me the weather?", + }, + ] + + data = { + "model": model, + "messages": messages, + } + if sync_mode: + response = litellm.completion(**data) + else: + response = await litellm.acompletion(**data) + + 
response_format_tests(response=response) + + print(f"response: {response}") + except litellm.RateLimitError as e: + pass + except Exception as e: + if "429 Quota exceeded" in str(e): + pass + else: + pytest.fail("An unexpected exception occurred - {}".format(str(e))) + + def vertex_httpx_mock_reject_prompt_post(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 diff --git a/litellm/utils.py b/litellm/utils.py index 7f615ab61..8baced4c5 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -5752,10 +5752,12 @@ def convert_to_model_response_object( model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore if "created" in response_object: - model_response_object.created = response_object["created"] + model_response_object.created = response_object["created"] or int( + time.time() + ) if "id" in response_object: - model_response_object.id = response_object["id"] + model_response_object.id = response_object["id"] or str(uuid.uuid4()) if "system_fingerprint" in response_object: model_response_object.system_fingerprint = response_object[ From 7df94100e8946b94ce74ee42dd2cce01af38b664 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 17:36:07 -0700 Subject: [PATCH 10/13] build(model_prices_and_context_window.json): add model pricing for vertex ai llama 3.1 api --- litellm/__init__.py | 2 + litellm/llms/vertex_ai_llama.py | 73 +------------------ ...odel_prices_and_context_window_backup.json | 10 +++ litellm/tests/test_optional_params.py | 13 ++++ litellm/utils.py | 12 +++ model_prices_and_context_window.json | 10 +++ 6 files changed, 50 insertions(+), 70 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 9bb9a81cd..5eea6346c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -357,6 +357,7 @@ vertex_text_models: List = [] vertex_code_text_models: List = [] vertex_embedding_models: List = [] vertex_anthropic_models: List = [] +vertex_llama3_models: List = [] ai21_models: List = [] nlp_cloud_models: List = [] aleph_alpha_models: List = [] @@ -828,6 +829,7 @@ from .llms.petals import PetalsConfig from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig +from .llms.vertex_ai_llama import VertexAILlama3Config from .llms.sagemaker import SagemakerConfig from .llms.ollama import OllamaConfig from .llms.ollama_chat import OllamaChatConfig diff --git a/litellm/llms/vertex_ai_llama.py b/litellm/llms/vertex_ai_llama.py index 4b5407faa..f33c127f7 100644 --- a/litellm/llms/vertex_ai_llama.py +++ b/litellm/llms/vertex_ai_llama.py @@ -53,39 +53,20 @@ class VertexAIError(Exception): class VertexAILlama3Config: """ - Reference:https://docs.anthropic.com/claude/reference/messages_post + Reference:https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama#streaming - Note that the API for Claude on Vertex differs from the Anthropic API documentation in the following ways: - - - `model` is not a valid parameter. The model is instead specified in the Google Cloud endpoint URL. - - `anthropic_version` is a required parameter and must be set to "vertex-2023-10-16". - - The class `VertexAIAnthropicConfig` provides configuration for the VertexAI's Anthropic API interface. Below are the parameters: + The class `VertexAILlama3Config` provides configuration for the VertexAI's Llama API interface. 
Below are the parameters: - `max_tokens` Required (integer) max tokens, - - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31" - - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py - - `temperature` Optional (float) The amount of randomness injected into the response - - `top_p` Optional (float) Use nucleus sampling. - - `top_k` Optional (int) Only sample from the top K options for each subsequent token - - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating Note: Please make sure to modify the default parameters as required for your use case. """ - max_tokens: Optional[int] = ( - 4096 # anthropic max - setting this doesn't impact response, but is required by anthropic. - ) - system: Optional[str] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - top_k: Optional[int] = None - stop_sequences: Optional[List[str]] = None + max_tokens: Optional[int] = None def __init__( self, max_tokens: Optional[int] = None, - anthropic_version: Optional[str] = None, ) -> None: locals_ = locals() for key, value in locals_.items(): @@ -115,61 +96,13 @@ class VertexAILlama3Config: def get_supported_openai_params(self): return [ "max_tokens", - "tools", - "tool_choice", "stream", - "stop", - "temperature", - "top_p", - "response_format", ] def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): if param == "max_tokens": optional_params["max_tokens"] = value - if param == "tools": - optional_params["tools"] = value - if param == "tool_choice": - _tool_choice: Optional[AnthropicMessagesToolChoice] = None - if value == "auto": - _tool_choice = {"type": "auto"} - elif value == "required": - _tool_choice = {"type": "any"} - elif isinstance(value, dict): - _tool_choice = {"type": "tool", "name": value["function"]["name"]} - - if _tool_choice is not None: - optional_params["tool_choice"] = _tool_choice - if param == "stream": - optional_params["stream"] = value - if param == "stop": - optional_params["stop_sequences"] = value - if param == "temperature": - optional_params["temperature"] = value - if param == "top_p": - optional_params["top_p"] = value - if param == "response_format" and "response_schema" in value: - """ - When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode - - You usually want to provide a single tool - - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
- """ - _tool_choice = None - _tool_choice = {"name": "json_tool_call", "type": "tool"} - - _tool = AnthropicMessagesTool( - name="json_tool_call", - input_schema={ - "type": "object", - "properties": {"values": value["response_schema"]}, # type: ignore - }, - ) - - optional_params["tools"] = [_tool] - optional_params["tool_choice"] = _tool_choice - optional_params["json_mode"] = True return optional_params diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index f86ea8bd7..e9e599945 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1948,6 +1948,16 @@ "supports_function_calling": true, "supports_vision": true }, + "vertex_ai/meta/llama3-405b-instruct-maas": { + "max_tokens": 32000, + "max_input_tokens": 32000, + "max_output_tokens": 32000, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "vertex_ai-llama_models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models" + }, "vertex_ai/imagegeneration@006": { "cost_per_image": 0.020, "litellm_provider": "vertex_ai-image-models", diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index bbfc88710..b8011960e 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -128,6 +128,19 @@ def test_azure_ai_mistral_optional_params(): assert "user" not in optional_params +def test_vertex_ai_llama_3_optional_params(): + litellm.vertex_llama3_models = ["meta/llama3-405b-instruct-maas"] + litellm.drop_params = True + optional_params = get_optional_params( + model="meta/llama3-405b-instruct-maas", + user="John", + custom_llm_provider="vertex_ai", + max_tokens=10, + temperature=0.2, + ) + assert "user" not in optional_params + + def test_azure_gpt_optional_params_gpt_vision(): # for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. 
We assert we actually set extra_body here optional_params = litellm.utils.get_optional_params( diff --git a/litellm/utils.py b/litellm/utils.py index 8baced4c5..035c1c72f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3088,6 +3088,15 @@ def get_optional_params( non_default_params=non_default_params, optional_params=optional_params, ) + elif custom_llm_provider == "vertex_ai" and model in litellm.vertex_llama3_models: + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.VertexAILlama3Config().map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + ) elif custom_llm_provider == "sagemaker": ## check if unsupported param passed in supported_params = get_supported_openai_params( @@ -4189,6 +4198,9 @@ def get_supported_openai_params( return litellm.GoogleAIStudioGeminiConfig().get_supported_openai_params() elif custom_llm_provider == "vertex_ai": if request_type == "chat_completion": + if model.startswith("meta/"): + return litellm.VertexAILlama3Config().get_supported_openai_params() + return litellm.VertexAIConfig().get_supported_openai_params() elif request_type == "embeddings": return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params() diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index f86ea8bd7..e9e599945 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1948,6 +1948,16 @@ "supports_function_calling": true, "supports_vision": true }, + "vertex_ai/meta/llama3-405b-instruct-maas": { + "max_tokens": 32000, + "max_input_tokens": 32000, + "max_output_tokens": 32000, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "vertex_ai-llama_models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models" + }, "vertex_ai/imagegeneration@006": { "cost_per_image": 0.020, "litellm_provider": "vertex_ai-image-models", From ae693424e4890dad4eca1cb1565f1e612d426468 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 17:55:28 -0700 Subject: [PATCH 11/13] fix(__init__.py): update init --- litellm/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/__init__.py b/litellm/__init__.py index 5eea6346c..5a10ae77c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -400,6 +400,9 @@ for key, value in model_cost.items(): elif value.get("litellm_provider") == "vertex_ai-anthropic_models": key = key.replace("vertex_ai/", "") vertex_anthropic_models.append(key) + elif value.get("litellm_provider") == "vertex_ai-llama_models": + key = key.replace("vertex_ai/", "") + vertex_llama3_models.append(key) elif value.get("litellm_provider") == "ai21": ai21_models.append(key) elif value.get("litellm_provider") == "nlp_cloud": From fb0a13c8bb2099a479146291a3b3f1a591dffeb6 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 21:44:24 -0700 Subject: [PATCH 12/13] fix(anthropic.py): support openai system message being a list --- litellm/llms/anthropic.py | 13 +++++++++++-- litellm/proxy/_new_secret_config.yaml | 5 ++--- litellm/tests/test_completion.py | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 629197d51..d3a3c38a4 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -780,8 +780,17 @@ class 
AnthropicChatCompletion(BaseLLM): system_prompt = "" for idx, message in enumerate(messages): if message["role"] == "system": - system_prompt += message["content"] - system_prompt_indices.append(idx) + valid_content: bool = False + if isinstance(message["content"], str): + system_prompt += message["content"] + valid_content = True + elif isinstance(message["content"], list): + for content in message["content"]: + system_prompt += content.get("text", "") + valid_content = True + + if valid_content: + system_prompt_indices.append(idx) if len(system_prompt_indices) > 0: for idx in reversed(system_prompt_indices): messages.pop(idx) diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index a1af38379..7e3c9a241 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,8 +1,7 @@ model_list: - - model_name: groq-llama3 + - model_name: anthropic-claude litellm_params: - model: groq/llama3-groq-70b-8192-tool-use-preview - api_key: os.environ/GROQ_API_KEY + model: claude-3-haiku-20240307 litellm_settings: callbacks: ["logfire"] diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index c2ce836ef..31b7b8355 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -346,7 +346,7 @@ def test_completion_claude_3_empty_response(): messages = [ { "role": "system", - "content": "You are 2twNLGfqk4GMOn3ffp4p.", + "content": [{"type": "text", "text": "You are 2twNLGfqk4GMOn3ffp4p."}], }, {"role": "user", "content": "Hi gm!", "name": "ishaan"}, {"role": "assistant", "content": "Good morning! How are you doing today?"}, From d5d2ffffdfb64a6b8fdeaee04bed6fea493fb587 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 23 Jul 2024 21:54:06 -0700 Subject: [PATCH 13/13] =?UTF-8?q?bump:=20version=201.41.28=20=E2=86=92=201?= =?UTF-8?q?.42.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/my-website/docs/providers/vertex.md | 79 ++++++++++++++++++++++++ pyproject.toml | 4 +- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 19442e11b..f87597046 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -749,6 +749,85 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ + +## Llama 3 API + +| Model Name | Function Call | +|------------------|--------------------------------------| +| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` | + +### Usage + + + + +```python +from litellm import completion +import os + +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" + +model = "meta/llama3-405b-instruct-maas" + +vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"] +vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"] + +response = completion( + model="vertex_ai/" + model, + messages=[{"role": "user", "content": "hi"}], + temperature=0.7, + vertex_ai_project=vertex_ai_project, + vertex_ai_location=vertex_ai_location, +) +print("\nModel Response", response) +``` + + + +**1. 
Add to config** + +```yaml +model_list: + - model_name: anthropic-llama + litellm_params: + model: vertex_ai/meta/llama3-405b-instruct-maas + vertex_ai_project: "my-test-project" + vertex_ai_location: "us-east-1" + - model_name: anthropic-llama + litellm_params: + model: vertex_ai/meta/llama3-405b-instruct-maas + vertex_ai_project: "my-test-project" + vertex_ai_location: "us-west-1" +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "anthropic-llama", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + ## Model Garden | Model Name | Function Call | |------------------|--------------------------------------| diff --git a/pyproject.toml b/pyproject.toml index 8a2168d2b..10246abd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.41.28" +version = "1.42.0" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.41.28" +version = "1.42.0" version_files = [ "pyproject.toml:^version" ]
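With the proxy restarted on 1.42.0 and the config above loaded, one quick sanity check is to list the models the proxy is serving and confirm the new Vertex AI Llama entry appears. This sketch assumes the proxy's OpenAI-compatible model-listing route and reuses the placeholder key `sk-1234` from the examples above:

```bash
# The "anthropic-llama" model_name from the config should be listed in the response
curl http://0.0.0.0:4000/v1/models \
  -H "Authorization: Bearer sk-1234"
```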