From cece76c4eefd4fc445ff80d214a805b6e7b7931a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 18:24:06 -0700 Subject: [PATCH 01/27] feat(bedrock_httpx.py): add ai21 jamba instruct as converse model initial commit for adding ai21 jamba instruct support through bedrock converse --- litellm/llms/bedrock_httpx.py | 2 +- litellm/model_prices_and_context_window_backup.json | 10 ++++++++++ model_prices_and_context_window.json | 10 ++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index b41dd542b..1461cfd90 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -74,7 +74,7 @@ BEDROCK_CONVERSE_MODELS = [ "anthropic.claude-v2", "anthropic.claude-v2:1", "anthropic.claude-v1", - "anthropic.claude-instant-v1", + "ai21.jamba-instruct-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5b11b8360..98bb161e1 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2812,6 +2812,16 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "ai21.jamba-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 70000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0000005, + "output_cost_per_token": 0.0000007, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_system_messages": true + }, "amazon.titan-text-lite-v1": { "max_tokens": 4000, "max_input_tokens": 42000, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5b11b8360..98bb161e1 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2812,6 +2812,16 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "ai21.jamba-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 70000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0000005, + "output_cost_per_token": 0.0000007, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_system_messages": true + }, "amazon.titan-text-lite-v1": { "max_tokens": 4000, "max_input_tokens": 42000, From 96471c145e8596b3ef4af2b7bc0e3c20ab915c89 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 19:36:50 -0700 Subject: [PATCH 02/27] fix(bedrock_httpx.py): support jamba streaming --- litellm/llms/bedrock_httpx.py | 128 ++++++++++++++++++++++++++++---- litellm/tests/test_streaming.py | 24 +++--- 2 files changed, 125 insertions(+), 27 deletions(-) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 1461cfd90..c3a563ce4 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -74,6 +74,7 @@ BEDROCK_CONVERSE_MODELS = [ "anthropic.claude-v2", "anthropic.claude-v2:1", "anthropic.claude-v1", + "anthropic.claude-instant-v1", "ai21.jamba-instruct-v1:0", ] @@ -195,13 +196,39 @@ async def make_call( if client is None: client = _get_async_httpx_client() # Create a new client if none provided - response = await client.post(api_base, headers=headers, data=data, stream=True) + response = await client.post( + api_base, + headers=headers, + data=data, + stream=True if "ai21" not in api_base else False, + ) if response.status_code != 200: raise BedrockError(status_code=response.status_code, message=response.text) - decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024)) + if "ai21" in api_base: + 
aws_bedrock_process_response = BedrockConverseLLM() + model_response: ( + ModelResponse + ) = aws_bedrock_process_response.process_response( + model=model, + response=response, + model_response=litellm.ModelResponse(), + stream=True, + logging_obj=logging_obj, + optional_params={}, + api_key="", + data=data, + messages=messages, + print_verbose=litellm.print_verbose, + encoding=litellm.encoding, + ) # type: ignore + completion_stream: Any = MockResponseIterator(model_response=model_response) + else: + decoder = AWSEventStreamDecoder(model=model) + completion_stream = decoder.aiter_bytes( + response.aiter_bytes(chunk_size=1024) + ) # LOGGING logging_obj.post_call( @@ -233,13 +260,35 @@ def make_sync_call( if client is None: client = _get_httpx_client() # Create a new client if none provided - response = client.post(api_base, headers=headers, data=data, stream=True) + response = client.post( + api_base, + headers=headers, + data=data, + stream=True if "ai21" not in api_base else False, + ) if response.status_code != 200: raise BedrockError(status_code=response.status_code, message=response.read()) - decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + if "ai21" in api_base: + aws_bedrock_process_response = BedrockConverseLLM() + model_response: ModelResponse = aws_bedrock_process_response.process_response( + model=model, + response=response, + model_response=litellm.ModelResponse(), + stream=True, + logging_obj=logging_obj, + optional_params={}, + api_key="", + data=data, + messages=messages, + print_verbose=litellm.print_verbose, + encoding=litellm.encoding, + ) # type: ignore + completion_stream: Any = MockResponseIterator(model_response=model_response) + else: + decoder = AWSEventStreamDecoder(model=model) + completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) # LOGGING logging_obj.post_call( @@ -1348,7 +1397,7 @@ class BedrockConverseLLM(BaseLLM): response: Union[requests.Response, httpx.Response], model_response: ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Optional[Logging], optional_params: dict, api_key: str, data: Union[dict, str], @@ -1358,12 +1407,13 @@ class BedrockConverseLLM(BaseLLM): ) -> Union[ModelResponse, CustomStreamWrapper]: ## LOGGING - logging_obj.post_call( - input=messages, - api_key=api_key, - original_response=response.text, - additional_args={"complete_input_dict": data}, - ) + if logging_obj is not None: + logging_obj.post_call( + input=messages, + api_key=api_key, + original_response=response.text, + additional_args={"complete_input_dict": data}, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT @@ -1900,7 +1950,7 @@ class BedrockConverseLLM(BaseLLM): if acompletion: if isinstance(client, HTTPHandler): client = None - if stream is True and provider != "ai21": + if stream is True: return self.async_streaming( model=model, messages=messages, @@ -1937,7 +1987,7 @@ class BedrockConverseLLM(BaseLLM): client=client, ) # type: ignore - if (stream is not None and stream is True) and provider != "ai21": + if stream is not None and stream is True: streaming_response = CustomStreamWrapper( completion_stream=None, @@ -1981,7 +2031,7 @@ class BedrockConverseLLM(BaseLLM): model=model, response=response, model_response=model_response, - stream=stream, + stream=stream if isinstance(stream, bool) else False, logging_obj=logging_obj, optional_params=optional_params, api_key="", @@ -2168,3 +2218,49 @@ class AWSEventStreamDecoder: return 
None return chunk.decode() # type: ignore[no-any-return] + + +class MockResponseIterator: # for returning ai21 streaming responses + def __init__(self, model_response): + self.model_response = model_response + self.is_done = False + + # Sync iterator + def __iter__(self): + return self + + def _chunk_parser(self, chunk_data: ModelResponse) -> GenericStreamingChunk: + + try: + chunk_usage: litellm.Usage = getattr(chunk_data, "usage") + processed_chunk = GenericStreamingChunk( + text=chunk_data.choices[0].message.content or "", # type: ignore + tool_use=None, + is_finished=True, + finish_reason=chunk_data.choices[0].finish_reason, # type: ignore + usage=ConverseTokenUsageBlock( + inputTokens=chunk_usage.prompt_tokens, + outputTokens=chunk_usage.completion_tokens, + totalTokens=chunk_usage.total_tokens, + ), + index=0, + ) + return processed_chunk + except Exception: + raise ValueError(f"Failed to decode chunk: {chunk_data}") + + def __next__(self): + if self.is_done: + raise StopIteration + self.is_done = True + return self._chunk_parser(self.model_response) + + # Async iterator + def __aiter__(self): + return self + + async def __anext__(self): + if self.is_done: + raise StopAsyncIteration + self.is_done = True + return self._chunk_parser(self.model_response) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 8c7943893..d07aa681d 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1312,22 +1312,22 @@ async def test_completion_replicate_llama3_streaming(sync_mode): # pytest.fail(f"Error occurred: {e}") -@pytest.mark.parametrize("sync_mode", [True]) # False +@pytest.mark.parametrize("sync_mode", [True, False]) # @pytest.mark.parametrize( - "model", + "model, region", [ - "bedrock/cohere.command-r-plus-v1:0", - "anthropic.claude-3-sonnet-20240229-v1:0", - "anthropic.claude-instant-v1", - "bedrock/ai21.j2-mid", - "mistral.mistral-7b-instruct-v0:2", - "bedrock/amazon.titan-tg1-large", - "meta.llama3-8b-instruct-v1:0", - "cohere.command-text-v14", + ["bedrock/ai21.jamba-instruct-v1:0", "us-east-1"], + ["bedrock/cohere.command-r-plus-v1:0", None], + ["anthropic.claude-3-sonnet-20240229-v1:0", None], + ["anthropic.claude-instant-v1", None], + ["mistral.mistral-7b-instruct-v0:2", None], + ["bedrock/amazon.titan-tg1-large", None], + ["meta.llama3-8b-instruct-v1:0", None], + ["cohere.command-text-v14", None], ], ) @pytest.mark.asyncio -async def test_bedrock_httpx_streaming(sync_mode, model): +async def test_bedrock_httpx_streaming(sync_mode, model, region): try: litellm.set_verbose = True if sync_mode: @@ -1337,6 +1337,7 @@ async def test_bedrock_httpx_streaming(sync_mode, model): messages=messages, max_tokens=10, # type: ignore stream=True, + aws_region_name=region, ) complete_response = "" # Add any assertions here to check the response @@ -1358,6 +1359,7 @@ async def test_bedrock_httpx_streaming(sync_mode, model): messages=messages, max_tokens=100, # type: ignore stream=True, + aws_region_name=region, ) complete_response = "" # Add any assertions here to check the response From dfc674622bcc1bdd369d183cf4fad0c180b5a123 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 16:55:50 -0700 Subject: [PATCH 03/27] litellm router - use free / paid tier --- litellm/types/router.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/types/router.py b/litellm/types/router.py index e7b8971bc..df9947c26 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -91,6 +91,7 @@ class ModelInfo(BaseModel): 
base_model: Optional[str] = ( None # specify if the base model is azure/gpt-3.5-turbo etc for accurate cost tracking ) + tier: Optional[Literal["free", "paid"]] = None def __init__(self, id: Optional[Union[str, int]] = None, **params): if id is None: @@ -328,6 +329,7 @@ class LiteLLMParamsTypedDict(TypedDict, total=False): class DeploymentTypedDict(TypedDict): model_name: str litellm_params: LiteLLMParamsTypedDict + model_info: ModelInfo SPECIAL_MODEL_INFO_PARAMS = [ From 229b7a649378715e7e6ed605a6738dfc92dd7764 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:06:06 -0700 Subject: [PATCH 04/27] helper to get_deployments_for_tier --- litellm/router_strategy/free_paid_tiers.py | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 litellm/router_strategy/free_paid_tiers.py diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py new file mode 100644 index 000000000..4328bd84c --- /dev/null +++ b/litellm/router_strategy/free_paid_tiers.py @@ -0,0 +1,64 @@ +""" +Use this to route requests between free and paid tiers +""" + +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast + +from litellm._logging import verbose_logger +from litellm.types.router import DeploymentTypedDict + + +class ModelInfo(TypedDict): + tier: Literal["free", "paid"] + + +class Deployment(TypedDict): + model_info: ModelInfo + + +async def get_deployments_for_tier( + request_kwargs: dict, + healthy_deployments: Optional[ + Union[List[DeploymentTypedDict], List[Dict[str, Any]]] + ] = None, +): + """ + if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models + """ + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) + if "metadata" in request_kwargs: + metadata = request_kwargs["metadata"] + if "tier" in metadata: + selected_tier: Literal["free", "paid"] = metadata["tier"] + if healthy_deployments is None: + return None + + if selected_tier == "free": + # get all deployments where model_info has tier = free + free_deployments: List[Any] = [] + verbose_logger.debug( + "Getting deployments in free tier, all_deployments: %s", + healthy_deployments, + ) + for deployment in healthy_deployments: + typed_deployment = cast(Deployment, deployment) + if typed_deployment["model_info"]["tier"] == "free": + free_deployments.append(deployment) + verbose_logger.debug("free_deployments: %s", free_deployments) + return free_deployments + + elif selected_tier == "paid": + # get all deployments where model_info has tier = paid + paid_deployments: List[Any] = [] + for deployment in healthy_deployments: + typed_deployment = cast(Deployment, deployment) + if typed_deployment["model_info"]["tier"] == "paid": + paid_deployments.append(deployment) + verbose_logger.debug("paid_deployments: %s", paid_deployments) + return paid_deployments + + verbose_logger.debug( + "no tier found in metadata, returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments From 0e70b5df14c5dc25c29a2424923cb7d801a1500d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:09:42 -0700 Subject: [PATCH 05/27] router - use free paid tier routing --- litellm/router.py | 7 ++ litellm/router_strategy/free_paid_tiers.py | 13 +++- litellm/tests/test_router_tiers.py | 90 ++++++++++++++++++++++ 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 litellm/tests/test_router_tiers.py diff --git 
a/litellm/router.py b/litellm/router.py index 2f72b8142..487d5fd6a 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,6 +47,7 @@ from litellm.assistants.main import AssistantDeleted from litellm.caching import DualCache, InMemoryCache, RedisCache from litellm.integrations.custom_logger import CustomLogger from litellm.llms.azure import get_azure_ad_token_from_oidc +from litellm.router_strategy.free_paid_tiers import get_deployments_for_tier from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler @@ -4481,6 +4482,12 @@ class Router: request_kwargs=request_kwargs, ) + # check free / paid tier for each deployment + healthy_deployments = await get_deployments_for_tier( + request_kwargs=request_kwargs, + healthy_deployments=healthy_deployments, + ) + if len(healthy_deployments) == 0: if _allowed_model_region is None: _allowed_model_region = "n/a" diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 4328bd84c..82e38b4f5 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ b/litellm/router_strategy/free_paid_tiers.py @@ -17,14 +17,19 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: dict, - healthy_deployments: Optional[ - Union[List[DeploymentTypedDict], List[Dict[str, Any]]] - ] = None, + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] diff --git a/litellm/tests/test_router_tiers.py b/litellm/tests/test_router_tiers.py new file mode 100644 index 000000000..54e67ded3 --- /dev/null +++ b/litellm/tests/test_router_tiers.py @@ -0,0 +1,90 @@ +#### What this tests #### +# This tests litellm router + +import asyncio +import os +import sys +import time +import traceback + +import openai +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import logging +import os +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +from dotenv import load_dotenv + +import litellm +from litellm import Router +from litellm._logging import verbose_logger + +verbose_logger.setLevel(logging.DEBUG) + + +load_dotenv() + + +@pytest.mark.asyncio() +async def test_router_free_paid_tier(): + """ + Pass list of orgs in 1 model definition, + expect a unique deployment for each to be created + """ + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"tier": "paid", "id": "very-expensive-model"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o-mini", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"tier": 
"free", "id": "very-cheap-model"}, + }, + ] + ) + + for _ in range(5): + # this should pick model with id == very-cheap-model + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tier": "free"}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "very-cheap-model" + + for _ in range(5): + # this should pick model with id == very-cheap-model + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tier": "paid"}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "very-expensive-model" From de8c92b11db8877241edd2b77ab4af2f1515a420 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:15:47 -0700 Subject: [PATCH 06/27] feat - enterprise --- litellm/proxy/litellm_pre_call_utils.py | 29 ++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index eaa2303ba..283f31e3c 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional from fastapi import Request from litellm._logging import verbose_logger, verbose_proxy_logger -from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth from litellm.types.utils import SupportedCacheControls if TYPE_CHECKING: @@ -95,15 +95,6 @@ async def add_litellm_data_to_request( cache_dict = parse_cache_control(cache_control_header) data["ttl"] = cache_dict.get("s-maxage") - ### KEY-LEVEL CACHNG - key_metadata = user_api_key_dict.metadata - if "cache" in key_metadata: - data["cache"] = {} - if isinstance(key_metadata["cache"], dict): - for k, v in key_metadata["cache"].items(): - if k in SupportedCacheControls: - data["cache"][k] = v - verbose_proxy_logger.debug("receiving data: %s", data) _metadata_variable_name = _get_metadata_variable_name(request) @@ -133,6 +124,24 @@ async def add_litellm_data_to_request( user_api_key_dict, "team_alias", None ) + ### KEY-LEVEL Contorls + key_metadata = user_api_key_dict.metadata + if "cache" in key_metadata: + data["cache"] = {} + if isinstance(key_metadata["cache"], dict): + for k, v in key_metadata["cache"].items(): + if k in SupportedCacheControls: + data["cache"][k] = v + if "tier" in key_metadata: + if premium_user is not True: + verbose_logger.warning( + "Trying to use free/paid tier feature. 
This will not be applied %s", + CommonProxyErrors.not_premium_user.value, + ) + + # add request tier to metadata + data[_metadata_variable_name]["tier"] = key_metadata["tier"] + # Team spend, budget - used by prometheus.py data[_metadata_variable_name][ "user_api_key_team_max_budget" From 59d599d5fdafe92ecef79c8311f8caf21ec01b6a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:31:02 -0700 Subject: [PATCH 07/27] test adding free / paid tier to metadata --- litellm/tests/test_litellm_pre_call_utils.py | 60 ++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 litellm/tests/test_litellm_pre_call_utils.py diff --git a/litellm/tests/test_litellm_pre_call_utils.py b/litellm/tests/test_litellm_pre_call_utils.py new file mode 100644 index 000000000..7f56d693d --- /dev/null +++ b/litellm/tests/test_litellm_pre_call_utils.py @@ -0,0 +1,60 @@ +""" +Tests litellm pre_call_utils +""" + +import os +import sys +import traceback +import uuid +from datetime import datetime + +from dotenv import load_dotenv +from fastapi import Request +from fastapi.routing import APIRoute + +from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request +from litellm.proxy.proxy_server import ProxyConfig, chat_completion + +load_dotenv() +import io +import os +import time + +import pytest + +# this file is to test litellm/proxy + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path + + +@pytest.mark.parametrize("tier", ["free", "paid"]) +@pytest.mark.asyncio() +async def test_adding_key_tier_to_request_metadata(tier): + """ + Tests if we can add tier: free/paid from key metadata to the request metadata + """ + data = {} + + api_route = APIRoute(path="/chat/completions", endpoint=chat_completion) + request = Request( + { + "type": "http", + "method": "POST", + "route": api_route, + "path": api_route.path, + "headers": [], + } + ) + new_data = await add_litellm_data_to_request( + data=data, + request=request, + user_api_key_dict=UserAPIKeyAuth(metadata={"tier": tier}), + proxy_config=ProxyConfig(), + ) + + print("new_data", new_data) + + assert new_data["metadata"]["tier"] == tier From 9f02fb5a33b9a9ddbb4557f2695e7b8c702251d4 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:01:21 -0700 Subject: [PATCH 08/27] docs using free, paid tier --- docs/my-website/docs/proxy/free_paid_tier.md | 102 +++++++++++++++++++ docs/my-website/sidebars.js | 3 +- 2 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 docs/my-website/docs/proxy/free_paid_tier.md diff --git a/docs/my-website/docs/proxy/free_paid_tier.md b/docs/my-website/docs/proxy/free_paid_tier.md new file mode 100644 index 000000000..01230e1f0 --- /dev/null +++ b/docs/my-website/docs/proxy/free_paid_tier.md @@ -0,0 +1,102 @@ +# πŸ’Έ Free, Paid Tier Routing + +Route Virtual Keys on `free tier` to cheaper models + +### 1. 
Define free, paid tier models on config.yaml + +:::info +Requests with `model=gpt-4` will be routed to either `openai/fake` or `openai/gpt-4o` depending on which tier the virtual key is on +::: + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + model_info: + tier: free # πŸ‘ˆ Key Change - set `tier to paid or free` + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + model_info: + tier: paid # πŸ‘ˆ Key Change - set `tier to paid or free` + +general_settings: + master_key: sk-1234 +``` + +### 2. Create Virtual Keys with pricing `tier=free` + +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "metadata": {"tier": "free"} +}' +``` + +### 3. Make Request with Key on `Free Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-inxzoSurQsjog9gPrVOCcA" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +**Expected Response** + +If this worked as expected then `x-litellm-model-api-base` should be `https://exampleopenaiendpoint-production.up.railway.app/` in the response headers + +```shell +x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ + +{"id":"chatcmpl-657b750f581240c1908679ed94b31bfe","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}}% +``` + + +### 4. Create Virtual Keys with pricing `tier=paid` + +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "metadata": {"tier": "paid"} + }' +``` + +### 5. Make Request with Key on `Paid Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-mnJoeSc6jFjzZr256q-iqA" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +**Expected Response** + +If this worked as expected then `x-litellm-model-api-base` should be `https://api.openai.com` in the response headers + +```shell +x-litellm-model-api-base: https://api.openai.com + +{"id":"chatcmpl-9mW75EbJCgwmLcO0M5DmwxpiBgWdc","choices":[{"finish_reason":"stop","index":0,"message":{"content":"Good morning! 
How can I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1721350215,"model":"gpt-4o-2024-05-13","object":"chat.completion","system_fingerprint":"fp_c4e5b6fa31","usage":{"completion_tokens":10,"prompt_tokens":12,"total_tokens":22}} +``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index d2179cafc..a74543c87 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -43,11 +43,12 @@ const sidebars = { "proxy/reliability", "proxy/cost_tracking", "proxy/self_serve", + "proxy/virtual_keys", + "proxy/free_paid_tier", "proxy/users", "proxy/team_budgets", "proxy/customers", "proxy/billing", - "proxy/virtual_keys", "proxy/guardrails", "proxy/token_auth", "proxy/alerting", From 946db012d4990dec3ee38ad4556692d2401ae55d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:01:30 -0700 Subject: [PATCH 09/27] docs free/paid tier --- litellm/proxy/proxy_config.yaml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 3f3b0858e..7e78cf317 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -1,23 +1,19 @@ model_list: - - model_name: fake-openai-endpoint + - model_name: gpt-4 litellm_params: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - - model_name: gemini-flash - litellm_params: - model: gemini/gemini-1.5-flash - - model_name: whisper - litellm_params: - model: whisper-1 - api_key: sk-******* - max_file_size_mb: 1000 model_info: - mode: audio_transcription + tier: free # πŸ‘ˆ Key Change - set `tier` + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + model_info: + tier: paid # πŸ‘ˆ Key Change - set `tier` general_settings: master_key: sk-1234 -litellm_settings: - success_callback: ["langsmith"] From f8bdfe7cc382787d75a907dbdd86e20a85476857 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:16:00 -0700 Subject: [PATCH 10/27] fix test amazing vertex medlm --- .../tests/test_amazing_vertex_completion.py | 66 +++++++------------ 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 6a381022e..b8ba54cb4 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -36,6 +36,20 @@ litellm.cache = None user_message = "Write a short poem about the sky" messages = [{"content": user_message, "role": "user"}] +VERTEX_MODELS_TO_NOT_TEST = [ + "medlm-medium", + "medlm-large", + "code-gecko", + "code-gecko@001", + "code-gecko@002", + "code-gecko@latest", + "codechat-bison@latest", + "code-bison@001", + "text-bison@001", + "gemini-1.5-pro", + "gemini-1.5-pro-preview-0215", +] + def get_vertex_ai_creds_json() -> dict: # Define the path to the vertex_key.json file @@ -327,17 +341,7 @@ def test_vertex_ai(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: try: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ( + if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this 
model @@ -382,17 +386,7 @@ def test_vertex_ai_stream(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: try: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ( + if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this model @@ -437,17 +431,9 @@ async def test_async_vertexai_response(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: print(f"model being tested in async call: {model}") - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ("gecko" in model or "32k" in model or "ultra" in model or "002" in model): + if model in VERTEX_MODELS_TO_NOT_TEST or ( + "gecko" in model or "32k" in model or "ultra" in model or "002" in model + ): # our account does not have access to this model continue try: @@ -484,17 +470,9 @@ async def test_async_vertexai_streaming_response(): test_models = random.sample(test_models, 1) test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ("gecko" in model or "32k" in model or "ultra" in model or "002" in model): + if model in VERTEX_MODELS_TO_NOT_TEST or ( + "gecko" in model or "32k" in model or "ultra" in model or "002" in model + ): # our account does not have access to this model continue try: From f2401d6d5eadef39036feacb034dba6f5e102a74 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 16:57:38 -0700 Subject: [PATCH 11/27] feat(vertex_ai_anthropic.py): support response_schema for vertex ai anthropic calls allows passing response_schema for anthropic calls. supports schema validation. 
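A rough caller-side sketch of what this change enables; the model name, region, and schema below are illustrative values (mirroring the docs and test fixtures in this series), not part of the diff itself:

```python
# Illustrative only: passing response_schema to a Vertex AI Anthropic model.
# Model name, region, and schema are example values; vertex credentials are
# assumed to already be configured in the environment.
from litellm import completion

response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {"recipe_name": {"type": "string"}},
        "required": ["recipe_name"],
    },
}

resp = completion(
    model="vertex_ai/claude-3-5-sonnet@20240620",
    messages=[{"role": "user", "content": "List 5 cookie recipes"}],
    response_format={
        "type": "json_object",
        "response_schema": response_schema,
        "enforce_validation": True,  # validate the returned JSON client-side
    },
    vertex_location="us-east5",
)
print(resp.choices[0].message.content)
```
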
--- .../json_validation_rule.py | 7 +- litellm/llms/anthropic.py | 71 ++++++++---- litellm/llms/vertex_ai_anthropic.py | 36 +++++- litellm/main.py | 5 + litellm/proxy/_new_secret_config.yaml | 14 ++- .../tests/test_amazing_vertex_completion.py | 104 ++++++++++++++---- 6 files changed, 189 insertions(+), 48 deletions(-) diff --git a/litellm/litellm_core_utils/json_validation_rule.py b/litellm/litellm_core_utils/json_validation_rule.py index f19144aaf..0f37e6737 100644 --- a/litellm/litellm_core_utils/json_validation_rule.py +++ b/litellm/litellm_core_utils/json_validation_rule.py @@ -13,7 +13,12 @@ def validate_schema(schema: dict, response: str): from litellm import JSONSchemaValidationError - response_dict = json.loads(response) + try: + response_dict = json.loads(response) + except json.JSONDecodeError: + raise JSONSchemaValidationError( + model="", llm_provider="", raw_response=response, schema=response + ) try: validate(response_dict, schema=schema) diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index af5ccf828..b666d9494 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -16,6 +16,7 @@ from litellm import verbose_logger from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, + HTTPHandler, _get_async_httpx_client, _get_httpx_client, ) @@ -538,7 +539,7 @@ class AnthropicChatCompletion(BaseLLM): def __init__(self) -> None: super().__init__() - def process_response( + def _process_response( self, model: str, response: Union[requests.Response, httpx.Response], @@ -551,6 +552,7 @@ class AnthropicChatCompletion(BaseLLM): messages: List, print_verbose, encoding, + json_mode: bool, ) -> ModelResponse: ## LOGGING logging_obj.post_call( @@ -574,27 +576,40 @@ class AnthropicChatCompletion(BaseLLM): ) else: text_content = "" - tool_calls = [] - for content in completion_response["content"]: + tool_calls: List[ChatCompletionToolCallChunk] = [] + for idx, content in enumerate(completion_response["content"]): if content["type"] == "text": text_content += content["text"] ## TOOL CALLING elif content["type"] == "tool_use": tool_calls.append( - { - "id": content["id"], - "type": "function", - "function": { - "name": content["name"], - "arguments": json.dumps(content["input"]), - }, - } + ChatCompletionToolCallChunk( + id=content["id"], + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=content["name"], + arguments=json.dumps(content["input"]), + ), + index=idx, + ) ) _message = litellm.Message( tool_calls=tool_calls, content=text_content or None, ) + + ## HANDLE JSON MODE - anthropic returns single function call + if json_mode and len(tool_calls) == 1: + json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( + "arguments" + ) + if json_mode_content_str is not None: + args = json.loads(json_mode_content_str) + values: Optional[dict] = args.get("values") + if values is not None: + _message = litellm.Message(content=json.dumps(values)) + completion_response["stop_reason"] = "stop" model_response.choices[0].message = _message # type: ignore model_response._hidden_params["original_response"] = completion_response[ "content" @@ -687,9 +702,11 @@ class AnthropicChatCompletion(BaseLLM): _is_function_call, data: dict, optional_params: dict, + json_mode: bool, litellm_params=None, logger_fn=None, headers={}, + client=None, ) -> Union[ModelResponse, CustomStreamWrapper]: async_handler = _get_async_httpx_client() @@ -705,7 +722,7 @@ class 
AnthropicChatCompletion(BaseLLM): ) raise e - return self.process_response( + return self._process_response( model=model, response=response, model_response=model_response, @@ -717,6 +734,7 @@ class AnthropicChatCompletion(BaseLLM): print_verbose=print_verbose, optional_params=optional_params, encoding=encoding, + json_mode=json_mode, ) def completion( @@ -731,10 +749,12 @@ class AnthropicChatCompletion(BaseLLM): api_key, logging_obj, optional_params: dict, + timeout: Union[float, httpx.Timeout], acompletion=None, litellm_params=None, logger_fn=None, headers={}, + client=None, ): headers = validate_environment(api_key, headers, model) _is_function_call = False @@ -787,14 +807,18 @@ class AnthropicChatCompletion(BaseLLM): anthropic_tools = [] for tool in optional_params["tools"]: - new_tool = tool["function"] - new_tool["input_schema"] = new_tool.pop("parameters") # rename key - anthropic_tools.append(new_tool) + if "input_schema" in tool: # assume in anthropic format + anthropic_tools.append(tool) + else: # assume openai tool call + new_tool = tool["function"] + new_tool["input_schema"] = new_tool.pop("parameters") # rename key + anthropic_tools.append(new_tool) optional_params["tools"] = anthropic_tools stream = optional_params.pop("stream", None) is_vertex_request: bool = optional_params.pop("is_vertex_request", False) + json_mode: bool = optional_params.pop("json_mode", False) data = { "messages": messages, @@ -815,7 +839,7 @@ class AnthropicChatCompletion(BaseLLM): }, ) print_verbose(f"_is_function_call: {_is_function_call}") - if acompletion == True: + if acompletion is True: if ( stream is True ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) @@ -857,15 +881,21 @@ class AnthropicChatCompletion(BaseLLM): litellm_params=litellm_params, logger_fn=logger_fn, headers=headers, + client=client, + json_mode=json_mode, ) else: ## COMPLETION CALL + if client is None or isinstance(client, AsyncHTTPHandler): + client = HTTPHandler(timeout=timeout) # type: ignore + else: + client = client if ( stream is True ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) print_verbose("makes anthropic streaming POST request") data["stream"] = stream - response = requests.post( + response = client.post( api_base, headers=headers, data=json.dumps(data), @@ -889,15 +919,13 @@ class AnthropicChatCompletion(BaseLLM): return streaming_response else: - response = requests.post( - api_base, headers=headers, data=json.dumps(data) - ) + response = client.post(api_base, headers=headers, data=json.dumps(data)) if response.status_code != 200: raise AnthropicError( status_code=response.status_code, message=response.text ) - return self.process_response( + return self._process_response( model=model, response=response, model_response=model_response, @@ -909,6 +937,7 @@ class AnthropicChatCompletion(BaseLLM): print_verbose=print_verbose, optional_params=optional_params, encoding=encoding, + json_mode=json_mode, ) def embedding(self): diff --git a/litellm/llms/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_anthropic.py index b8362d5a5..900e7795f 100644 --- a/litellm/llms/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_anthropic.py @@ -7,7 +7,7 @@ import time import types import uuid from enum import Enum -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple, Union import httpx # type: ignore import requests # type: ignore @@ -15,7 +15,14 @@ import requests # 
type: ignore import litellm from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler -from litellm.types.llms.anthropic import AnthropicMessagesToolChoice +from litellm.types.llms.anthropic import ( + AnthropicMessagesTool, + AnthropicMessagesToolChoice, +) +from litellm.types.llms.openai import ( + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, +) from litellm.types.utils import ResponseFormatChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage @@ -142,7 +149,27 @@ class VertexAIAnthropicConfig: if param == "top_p": optional_params["top_p"] = value if param == "response_format" and "response_schema" in value: - optional_params["response_format"] = ResponseFormatChunk(**value) # type: ignore + """ + When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode + - You usually want to provide a single tool + - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool + - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. + """ + _tool_choice = None + _tool_choice = {"name": "json_tool_call", "type": "tool"} + + _tool = AnthropicMessagesTool( + name="json_tool_call", + input_schema={ + "type": "object", + "properties": {"values": value["response_schema"]}, # type: ignore + }, + ) + + optional_params["tools"] = [_tool] + optional_params["tool_choice"] = _tool_choice + optional_params["json_mode"] = True + return optional_params @@ -222,6 +249,7 @@ def completion( optional_params: dict, custom_prompt_dict: dict, headers: Optional[dict], + timeout: Union[float, httpx.Timeout], vertex_project=None, vertex_location=None, vertex_credentials=None, @@ -301,6 +329,8 @@ def completion( litellm_params=litellm_params, logger_fn=logger_fn, headers=vertex_headers, + client=client, + timeout=timeout, ) except Exception as e: diff --git a/litellm/main.py b/litellm/main.py index e01603b7e..69c845ad8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1528,6 +1528,8 @@ def completion( api_key=api_key, logging_obj=logging, headers=headers, + timeout=timeout, + client=client, ) if optional_params.get("stream", False) or acompletion == True: ## LOGGING @@ -2046,7 +2048,10 @@ def completion( acompletion=acompletion, headers=headers, custom_prompt_dict=custom_prompt_dict, + timeout=timeout, + client=client, ) + else: model_response = vertex_ai.completion( model=model, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 641c70ebc..1bd421f8d 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,5 +1,13 @@ model_list: - - model_name: llama-3 + - model_name: bad-azure-model litellm_params: - model: gpt-4 - request_timeout: 1 + model: azure/chatgpt-v-2 + azure_ad_token: "" + api_base: os.environ/AZURE_API_BASE + + - model_name: good-openai-model + litellm_params: + model: gpt-3.5-turbo + +litellm_settings: + fallbacks: [{"bad-azure-model": ["good-openai-model"]}] diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index b8ba54cb4..3def5a1ec 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -1128,6 +1128,39 @@ def vertex_httpx_mock_post_valid_response(*args, **kwargs): return mock_response +def 
vertex_httpx_mock_post_valid_response_anthropic(*args, **kwargs): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = { + "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", + "type": "message", + "role": "assistant", + "model": "claude-3-5-sonnet-20240620", + "content": [ + { + "type": "tool_use", + "id": "toolu_vrtx_01YMnYZrToPPfcmY2myP2gEB", + "name": "json_tool_call", + "input": { + "values": [ + {"recipe_name": "Chocolate Chip Cookies"}, + {"recipe_name": "Oatmeal Raisin Cookies"}, + {"recipe_name": "Peanut Butter Cookies"}, + {"recipe_name": "Snickerdoodle Cookies"}, + {"recipe_name": "Sugar Cookies"}, + ] + }, + } + ], + "stop_reason": "tool_use", + "stop_sequence": None, + "usage": {"input_tokens": 368, "output_tokens": 118}, + } + + return mock_response + + def vertex_httpx_mock_post_invalid_schema_response(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 @@ -1183,11 +1216,29 @@ def vertex_httpx_mock_post_invalid_schema_response(*args, **kwargs): return mock_response +def vertex_httpx_mock_post_invalid_schema_response_anthropic(*args, **kwargs): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = { + "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", + "type": "message", + "role": "assistant", + "model": "claude-3-5-sonnet-20240620", + "content": [{"text": "Hi! My name is Claude.", "type": "text"}], + "stop_reason": "end_turn", + "stop_sequence": None, + "usage": {"input_tokens": 368, "output_tokens": 118}, + } + return mock_response + + @pytest.mark.parametrize( "model, vertex_location, supports_response_schema", [ ("vertex_ai_beta/gemini-1.5-pro-001", "us-central1", True), ("vertex_ai_beta/gemini-1.5-flash", "us-central1", False), + ("vertex_ai/claude-3-5-sonnet@20240620", "us-east5", False), ], ) @pytest.mark.parametrize( @@ -1231,12 +1282,21 @@ async def test_gemini_pro_json_schema_args_sent_httpx( httpx_response = MagicMock() if invalid_response is True: - httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response + if "claude" in model: + httpx_response.side_effect = ( + vertex_httpx_mock_post_invalid_schema_response_anthropic + ) + else: + httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response else: - httpx_response.side_effect = vertex_httpx_mock_post_valid_response + if "claude" in model: + httpx_response.side_effect = vertex_httpx_mock_post_valid_response_anthropic + else: + httpx_response.side_effect = vertex_httpx_mock_post_valid_response with patch.object(client, "post", new=httpx_response) as mock_call: + print("SENDING CLIENT POST={}".format(client.post)) try: - _ = completion( + resp = completion( model=model, messages=messages, response_format={ @@ -1247,30 +1307,34 @@ async def test_gemini_pro_json_schema_args_sent_httpx( vertex_location=vertex_location, client=client, ) + print("Received={}".format(resp)) if invalid_response is True and enforce_validation is True: pytest.fail("Expected this to fail") except litellm.JSONSchemaValidationError as e: - if invalid_response is False and "claude-3" not in model: + if invalid_response is False: pytest.fail("Expected this to pass. 
Got={}".format(e)) mock_call.assert_called_once() - print(mock_call.call_args.kwargs) - print(mock_call.call_args.kwargs["json"]["generationConfig"]) + if "claude" not in model: + print(mock_call.call_args.kwargs) + print(mock_call.call_args.kwargs["json"]["generationConfig"]) - if supports_response_schema: - assert ( - "response_schema" - in mock_call.call_args.kwargs["json"]["generationConfig"] - ) - else: - assert ( - "response_schema" - not in mock_call.call_args.kwargs["json"]["generationConfig"] - ) - assert ( - "Use this JSON schema:" - in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1]["text"] - ) + if supports_response_schema: + assert ( + "response_schema" + in mock_call.call_args.kwargs["json"]["generationConfig"] + ) + else: + assert ( + "response_schema" + not in mock_call.call_args.kwargs["json"]["generationConfig"] + ) + assert ( + "Use this JSON schema:" + in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1][ + "text" + ] + ) @pytest.mark.parametrize("provider", ["vertex_ai_beta"]) # "vertex_ai", From af0d30e41eddd6b39369588d3c79b0cc07b7ec67 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 17:20:19 -0700 Subject: [PATCH 12/27] docs(json_mode.md): add json mode to docs --- docs/my-website/docs/completion/json_mode.md | 137 +++++++++++++++++++ docs/my-website/sidebars.js | 1 + 2 files changed, 138 insertions(+) create mode 100644 docs/my-website/docs/completion/json_mode.md diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md new file mode 100644 index 000000000..0e7e64a8e --- /dev/null +++ b/docs/my-website/docs/completion/json_mode.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# JSON Mode + +## Quick Start + + + + +```python +from litellm import completion +import os + +os.environ["OPENAI_API_KEY"] = "" + +response = completion( + model="gpt-4o-mini", + response_format={ "type": "json_object" }, + messages=[ + {"role": "system", "content": "You are a helpful assistant designed to output JSON."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] +) +print(response.choices[0].message.content) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "gpt-4o-mini", + "response_format": { "type": "json_object" }, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant designed to output JSON." + }, + { + "role": "user", + "content": "Who won the world series in 2020?" + } + ] + }' +``` + + + +## Check Model Support + +Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`. + +```python +from litellm import get_supported_openai_params + +params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock") + +assert "response_format" in params +``` + +## Validate JSON Schema + +For VertexAI models, LiteLLM supports passing the `response_schema` and validating the JSON output. + +This works across Gemini (`vertex_ai_beta/`) + Anthropic (`vertex_ai/`) models. 
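When `enforce_validation` is set and the returned output does not satisfy `response_schema` (or is not valid JSON), LiteLLM raises `litellm.JSONSchemaValidationError`, which you can handle client-side. A minimal sketch, assuming vertex credentials are already configured (e.g. via `gcloud auth application-default login`); how you retry or fall back is up to your application:

```python
import litellm
from litellm import completion

try:
    resp = completion(
        model="vertex_ai_beta/gemini-1.5-pro",
        messages=[{"role": "user", "content": "List 5 cookie recipes"}],
        response_format={
            "type": "json_object",
            "response_schema": {"type": "array", "items": {"type": "object"}},
            "enforce_validation": True,
        },
    )
except litellm.JSONSchemaValidationError:
    # the model returned JSON that does not satisfy the schema, or non-JSON text
    resp = None
```
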
+ + + + + +```python +# !gcloud auth application-default login - run this to add vertex credentials to your env + +from litellm import completion + +messages = [{"role": "user", "content": "List 5 cookie recipes"}] + +response_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "recipe_name": { + "type": "string", + }, + }, + "required": ["recipe_name"], + }, +} + +resp = completion( + model="vertex_ai_beta/gemini-1.5-pro", + messages=messages, + response_format={ + "type": "json_object", + "response_schema": response_schema, + "enforce_validation": True, # client-side json schema validation + }, + vertex_location="us-east5", +) + +print("Received={}".format(resp)) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "vertex_ai_beta/gemini-1.5-pro", + "messages": [{"role": "user", "content": "List 5 cookie recipes"}] + "response_format": { + "type": "json_object", + "enforce_validation: true, + "response_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "recipe_name": { + "type": "string", + }, + }, + "required": ["recipe_name"], + }, + } + }, + }' +``` + + + \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index a74543c87..eea863d8e 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -92,6 +92,7 @@ const sidebars = { items: [ "completion/input", "completion/provider_specific_params", + "completion/json_mode", "completion/drop_params", "completion/prompt_formatting", "completion/output", From b0f0898f2fea066f60ec324ecccddd1fd357cff8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:06:06 -0700 Subject: [PATCH 13/27] helper to get_deployments_for_tier --- litellm/router_strategy/free_paid_tiers.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 82e38b4f5..4328bd84c 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ b/litellm/router_strategy/free_paid_tiers.py @@ -17,19 +17,14 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: Optional[Dict[Any, Any]] = None, - healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, + request_kwargs: dict, + healthy_deployments: Optional[ + Union[List[DeploymentTypedDict], List[Dict[str, Any]]] + ] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ - if request_kwargs is None: - verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments - verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] From d42963a0ae5d77729c05320e8ac49e2290d76e88 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:09:42 -0700 Subject: [PATCH 14/27] router - use free paid tier routing --- litellm/router_strategy/free_paid_tiers.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 4328bd84c..82e38b4f5 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ 
b/litellm/router_strategy/free_paid_tiers.py @@ -17,14 +17,19 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: dict, - healthy_deployments: Optional[ - Union[List[DeploymentTypedDict], List[Dict[str, Any]]] - ] = None, + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] From db3063d3d3ce0330ebbe95a6d6ce366826e5e9f7 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 22:05:10 -0700 Subject: [PATCH 15/27] test: fix test --- litellm/tests/test_proxy_exception_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/tests/test_proxy_exception_mapping.py b/litellm/tests/test_proxy_exception_mapping.py index 4fb1e7134..07ae7f5a8 100644 --- a/litellm/tests/test_proxy_exception_mapping.py +++ b/litellm/tests/test_proxy_exception_mapping.py @@ -75,8 +75,8 @@ def test_chat_completion_exception(client): print("ERROR=", json_response["error"]) assert isinstance(json_response["error"]["message"], str) assert ( - json_response["error"]["message"] - == "litellm.AuthenticationError: AuthenticationError: OpenAIException - Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys." + "litellm.AuthenticationError: AuthenticationError: OpenAIException - Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys." 
+ in json_response["error"]["message"] ) # make an openai client to call _make_status_error_from_response From 38c50e674eee6da0e18b951cc78f8ed7d7832433 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:22:40 -0700 Subject: [PATCH 16/27] fix ui make ui session last 24 hours --- litellm/proxy/proxy_server.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index d2337c37f..592d95777 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1734,9 +1734,7 @@ class ProxyConfig: "background_health_checks", False ) health_check_interval = general_settings.get("health_check_interval", 300) - health_check_details = general_settings.get( - "health_check_details", True - ) + health_check_details = general_settings.get("health_check_details", True) ## check if user has set a premium feature in general_settings if ( @@ -7793,7 +7791,7 @@ async def login(request: Request): request_type="key", **{ "user_role": LitellmUserRoles.PROXY_ADMIN, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -7857,7 +7855,7 @@ async def login(request: Request): request_type="key", **{ # type: ignore "user_role": user_role, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -7998,7 +7996,7 @@ async def onboarding(invite_link: str): request_type="key", **{ "user_role": user_obj.user_role, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -8336,7 +8334,7 @@ async def auth_callback(request: Request): # User might not be already created on first generation of key # But if it is, we want their models preferences default_ui_key_values = { - "duration": "2hr", + "duration": "12hr", "key_max_budget": 0.01, "aliases": {}, "config": {}, From ad46e6a61f20652efdbfe351af1d576a7910609b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:22:09 -0700 Subject: [PATCH 17/27] router - refactor to tag based routing --- litellm/router.py | 6 +- litellm/router_strategy/free_paid_tiers.py | 69 ------------------- litellm/router_strategy/tag_based_routing.py | 68 ++++++++++++++++++ ...er_tiers.py => test_router_tag_routing.py} | 10 +-- litellm/types/router.py | 4 ++ 5 files changed, 81 insertions(+), 76 deletions(-) delete mode 100644 litellm/router_strategy/free_paid_tiers.py create mode 100644 litellm/router_strategy/tag_based_routing.py rename litellm/tests/{test_router_tiers.py => test_router_tag_routing.py} (89%) diff --git a/litellm/router.py b/litellm/router.py index 487d5fd6a..44c02f126 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,12 +47,12 @@ from litellm.assistants.main import AssistantDeleted from litellm.caching import DualCache, InMemoryCache, RedisCache from litellm.integrations.custom_logger import CustomLogger from litellm.llms.azure import get_azure_ad_token_from_oidc -from litellm.router_strategy.free_paid_tiers import get_deployments_for_tier from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2 +from litellm.router_strategy.tag_based_routing import get_deployments_for_tag from 
litellm.router_utils.client_initalization_utils import ( set_client, should_initialize_sync_client, @@ -4482,8 +4482,8 @@ class Router: request_kwargs=request_kwargs, ) - # check free / paid tier for each deployment - healthy_deployments = await get_deployments_for_tier( + # check if user wants to do tag based routing + healthy_deployments = await get_deployments_for_tag( request_kwargs=request_kwargs, healthy_deployments=healthy_deployments, ) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py deleted file mode 100644 index 82e38b4f5..000000000 --- a/litellm/router_strategy/free_paid_tiers.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Use this to route requests between free and paid tiers -""" - -from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast - -from litellm._logging import verbose_logger -from litellm.types.router import DeploymentTypedDict - - -class ModelInfo(TypedDict): - tier: Literal["free", "paid"] - - -class Deployment(TypedDict): - model_info: ModelInfo - - -async def get_deployments_for_tier( - request_kwargs: Optional[Dict[Any, Any]] = None, - healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, -): - """ - if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models - """ - if request_kwargs is None: - verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments - - verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) - if "metadata" in request_kwargs: - metadata = request_kwargs["metadata"] - if "tier" in metadata: - selected_tier: Literal["free", "paid"] = metadata["tier"] - if healthy_deployments is None: - return None - - if selected_tier == "free": - # get all deployments where model_info has tier = free - free_deployments: List[Any] = [] - verbose_logger.debug( - "Getting deployments in free tier, all_deployments: %s", - healthy_deployments, - ) - for deployment in healthy_deployments: - typed_deployment = cast(Deployment, deployment) - if typed_deployment["model_info"]["tier"] == "free": - free_deployments.append(deployment) - verbose_logger.debug("free_deployments: %s", free_deployments) - return free_deployments - - elif selected_tier == "paid": - # get all deployments where model_info has tier = paid - paid_deployments: List[Any] = [] - for deployment in healthy_deployments: - typed_deployment = cast(Deployment, deployment) - if typed_deployment["model_info"]["tier"] == "paid": - paid_deployments.append(deployment) - verbose_logger.debug("paid_deployments: %s", paid_deployments) - return paid_deployments - - verbose_logger.debug( - "no tier found in metadata, returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py new file mode 100644 index 000000000..11bad19a3 --- /dev/null +++ b/litellm/router_strategy/tag_based_routing.py @@ -0,0 +1,68 @@ +""" +Use this to route requests between free and paid tiers +""" + +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast + +from litellm._logging import verbose_logger +from litellm.types.router import DeploymentTypedDict + + +async def get_deployments_for_tag( + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], 
Dict[Any, Any]]] = None, +): + """ + if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models + """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + + if healthy_deployments is None: + verbose_logger.debug( + "get_deployments_for_tier: healthy_deployments is None returning healthy_deployments" + ) + return healthy_deployments + + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) + if "metadata" in request_kwargs: + metadata = request_kwargs["metadata"] + request_tags = metadata.get("tags") + + new_healthy_deployments = [] + if request_tags: + verbose_logger.debug("parameter routing: router_keys: %s", request_tags) + # example this can be router_keys=["free", "custom"] + # get all deployments that have a superset of these router keys + for deployment in healthy_deployments: + deployment_litellm_params = deployment.get("litellm_params") + deployment_tags = deployment_litellm_params.get("tags") + + verbose_logger.debug( + "deployment: %s, deployment_router_keys: %s", + deployment, + deployment_tags, + ) + + if deployment_tags is None: + continue + + if set(request_tags).issubset(set(deployment_tags)): + verbose_logger.debug( + "adding deployment with tags: %s, request tags: %s", + deployment_tags, + request_tags, + ) + new_healthy_deployments.append(deployment) + + return new_healthy_deployments + + verbose_logger.debug( + "no tier found in metadata, returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments diff --git a/litellm/tests/test_router_tiers.py b/litellm/tests/test_router_tag_routing.py similarity index 89% rename from litellm/tests/test_router_tiers.py rename to litellm/tests/test_router_tag_routing.py index 54e67ded3..feb67c0e9 100644 --- a/litellm/tests/test_router_tiers.py +++ b/litellm/tests/test_router_tag_routing.py @@ -45,16 +45,18 @@ async def test_router_free_paid_tier(): "litellm_params": { "model": "gpt-4o", "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["free"], }, - "model_info": {"tier": "paid", "id": "very-expensive-model"}, + "model_info": {"id": "very-cheap-model"}, }, { "model_name": "gpt-4", "litellm_params": { "model": "gpt-4o-mini", "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["paid"], }, - "model_info": {"tier": "free", "id": "very-cheap-model"}, + "model_info": {"id": "very-expensive-model"}, }, ] ) @@ -64,7 +66,7 @@ async def test_router_free_paid_tier(): response = await router.acompletion( model="gpt-4", messages=[{"role": "user", "content": "Tell me a joke."}], - metadata={"tier": "free"}, + metadata={"tags": ["free"]}, ) print("Response: ", response) @@ -79,7 +81,7 @@ async def test_router_free_paid_tier(): response = await router.acompletion( model="gpt-4", messages=[{"role": "user", "content": "Tell me a joke."}], - metadata={"tier": "paid"}, + metadata={"tags": ["paid"]}, ) print("Response: ", response) diff --git a/litellm/types/router.py b/litellm/types/router.py index df9947c26..78dfbc4c1 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -325,6 +325,10 @@ class LiteLLMParamsTypedDict(TypedDict, total=False): ## MOCK RESPONSES ## mock_response: Optional[Union[str, ModelResponse, Exception]] + # routing params + # use this for tag-based routing + tags: 
Optional[List[str]] + class DeploymentTypedDict(TypedDict): model_name: str From 79c5788ad99ffee06b0850f90a8bbfc1421b5a68 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:24:13 -0700 Subject: [PATCH 18/27] fix remove previous code on free/paid tier --- litellm/proxy/litellm_pre_call_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index 283f31e3c..e0e875308 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -132,15 +132,6 @@ async def add_litellm_data_to_request( for k, v in key_metadata["cache"].items(): if k in SupportedCacheControls: data["cache"][k] = v - if "tier" in key_metadata: - if premium_user is not True: - verbose_logger.warning( - "Trying to use free/paid tier feature. This will not be applied %s", - CommonProxyErrors.not_premium_user.value, - ) - - # add request tier to metadata - data[_metadata_variable_name]["tier"] = key_metadata["tier"] # Team spend, budget - used by prometheus.py data[_metadata_variable_name][ From e29851503473667416311d52efa5360b2ad89236 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:34:45 -0700 Subject: [PATCH 19/27] fix use tags as a litellm param --- litellm/main.py | 5 +++++ litellm/proxy/proxy_config.yaml | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 69c845ad8..3889d1bc8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -735,6 +735,7 @@ def completion( ] litellm_params = [ "metadata", + "tags", "acompletion", "atext_completion", "text_completion", @@ -3155,6 +3156,7 @@ def embedding( "allowed_model_region", "model_config", "cooldown_time", + "tags", ] default_params = openai_params + litellm_params non_default_params = { @@ -4384,6 +4386,8 @@ def transcription( proxy_server_request = kwargs.get("proxy_server_request", None) model_info = kwargs.get("model_info", None) metadata = kwargs.get("metadata", {}) + tags = kwargs.pop("tags", []) + drop_params = kwargs.get("drop_params", None) client: Optional[ Union[ @@ -4556,6 +4560,7 @@ def speech( ) -> HttpxBinaryResponseContent: model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + tags = kwargs.pop("tags", []) optional_params = {} if response_format is not None: diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 7e78cf317..81ed12c07 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,14 +4,12 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - model_info: - tier: free # πŸ‘ˆ Key Change - set `tier` + tags: ["free"] - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - model_info: - tier: paid # πŸ‘ˆ Key Change - set `tier` + tags: ["paid"] general_settings: master_key: sk-1234 From 52d0f6a808e53fdfdda6b26648c02586c6f49ff8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:39:04 -0700 Subject: [PATCH 20/27] control using enable_tag_filtering --- litellm/router.py | 3 + litellm/router_strategy/tag_based_routing.py | 13 ++++- litellm/tests/test_litellm_pre_call_utils.py | 60 -------------------- 3 files changed, 15 insertions(+), 61 deletions(-) delete mode 100644 litellm/tests/test_litellm_pre_call_utils.py diff --git a/litellm/router.py 
b/litellm/router.py index 44c02f126..0e693e188 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -145,6 +145,7 @@ class Router: content_policy_fallbacks: List = [], model_group_alias: Optional[dict] = {}, enable_pre_call_checks: bool = False, + enable_tag_filtering: bool = False, retry_after: int = 0, # min time to wait before retrying a failed request retry_policy: Optional[ RetryPolicy @@ -246,6 +247,7 @@ class Router: self.set_verbose = set_verbose self.debug_level = debug_level self.enable_pre_call_checks = enable_pre_call_checks + self.enable_tag_filtering = enable_tag_filtering if self.set_verbose == True: if debug_level == "INFO": verbose_router_logger.setLevel(logging.INFO) @@ -4484,6 +4486,7 @@ class Router: # check if user wants to do tag based routing healthy_deployments = await get_deployments_for_tag( + llm_router_instance=self, request_kwargs=request_kwargs, healthy_deployments=healthy_deployments, ) diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py index 11bad19a3..2dbc5cb93 100644 --- a/litellm/router_strategy/tag_based_routing.py +++ b/litellm/router_strategy/tag_based_routing.py @@ -2,19 +2,30 @@ Use this to route requests between free and paid tiers """ -from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union from litellm._logging import verbose_logger from litellm.types.router import DeploymentTypedDict +if TYPE_CHECKING: + from litellm.router import Router as _Router + + LitellmRouter = _Router +else: + LitellmRouter = Any + async def get_deployments_for_tag( + llm_router_instance: LitellmRouter, request_kwargs: Optional[Dict[Any, Any]] = None, healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if llm_router_instance.enable_tag_filtering is not True: + return healthy_deployments + if request_kwargs is None: verbose_logger.debug( "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", diff --git a/litellm/tests/test_litellm_pre_call_utils.py b/litellm/tests/test_litellm_pre_call_utils.py deleted file mode 100644 index 7f56d693d..000000000 --- a/litellm/tests/test_litellm_pre_call_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Tests litellm pre_call_utils -""" - -import os -import sys -import traceback -import uuid -from datetime import datetime - -from dotenv import load_dotenv -from fastapi import Request -from fastapi.routing import APIRoute - -from litellm.proxy._types import UserAPIKeyAuth -from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request -from litellm.proxy.proxy_server import ProxyConfig, chat_completion - -load_dotenv() -import io -import os -import time - -import pytest - -# this file is to test litellm/proxy - -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path - - -@pytest.mark.parametrize("tier", ["free", "paid"]) -@pytest.mark.asyncio() -async def test_adding_key_tier_to_request_metadata(tier): - """ - Tests if we can add tier: free/paid from key metadata to the request metadata - """ - data = {} - - api_route = APIRoute(path="/chat/completions", endpoint=chat_completion) - request = Request( - { - "type": "http", - "method": "POST", - "route": api_route, - "path": api_route.path, - "headers": [], - } - ) - 
new_data = await add_litellm_data_to_request( - data=data, - request=request, - user_api_key_dict=UserAPIKeyAuth(metadata={"tier": tier}), - proxy_config=ProxyConfig(), - ) - - print("new_data", new_data) - - assert new_data["metadata"]["tier"] == tier From 1ab5c1a22700f845840a1ab888c1eda84178d12f Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 20:10:45 -0700 Subject: [PATCH 21/27] check if using tag based routing --- litellm/proxy/litellm_pre_call_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index e0e875308..e6bce5392 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -75,7 +75,7 @@ async def add_litellm_data_to_request( dict: The modified data dictionary. """ - from litellm.proxy.proxy_server import premium_user + from litellm.proxy.proxy_server import llm_router, premium_user safe_add_api_version_from_query_params(data, request) @@ -166,7 +166,8 @@ async def add_litellm_data_to_request( if user_api_key_dict.allowed_model_region is not None: data["allowed_model_region"] = user_api_key_dict.allowed_model_region - ## [Enterprise Only] Add User-IP Address + ## [Enterprise Only] + # Add User-IP Address requester_ip_address = "" if premium_user is True: # Only set the IP Address for Enterprise Users @@ -179,6 +180,15 @@ async def add_litellm_data_to_request( requester_ip_address = request.client.host data[_metadata_variable_name]["requester_ip_address"] = requester_ip_address + # Enterprise Only - Check if using tag based routing + if llm_router and llm_router.enable_tag_filtering is True: + if premium_user is not True: + verbose_proxy_logger.warning( + "router.enable_tag_filtering is on %s \n switched off router.enable_tag_filtering", + CommonProxyErrors.not_premium_user.value, + ) + llm_router.enable_tag_filtering = False + ### TEAM-SPECIFIC PARAMS ### if user_api_key_dict.team_id is not None: team_config = await proxy_config.load_team_config( From 56489ad9cc97dd9e3691118eded480dadf29cbdf Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:48:24 -0700 Subject: [PATCH 22/27] rename doc --- docs/my-website/docs/proxy/free_paid_tier.md | 102 ------------------- docs/my-website/docs/proxy/tag_routing.md | 38 +++++++ docs/my-website/sidebars.js | 2 +- 3 files changed, 39 insertions(+), 103 deletions(-) delete mode 100644 docs/my-website/docs/proxy/free_paid_tier.md create mode 100644 docs/my-website/docs/proxy/tag_routing.md diff --git a/docs/my-website/docs/proxy/free_paid_tier.md b/docs/my-website/docs/proxy/free_paid_tier.md deleted file mode 100644 index 01230e1f0..000000000 --- a/docs/my-website/docs/proxy/free_paid_tier.md +++ /dev/null @@ -1,102 +0,0 @@ -# πŸ’Έ Free, Paid Tier Routing - -Route Virtual Keys on `free tier` to cheaper models - -### 1. 
Define free, paid tier models on config.yaml - -:::info -Requests with `model=gpt-4` will be routed to either `openai/fake` or `openai/gpt-4o` depending on which tier the virtual key is on -::: - -```yaml -model_list: - - model_name: gpt-4 - litellm_params: - model: openai/fake - api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - model_info: - tier: free # πŸ‘ˆ Key Change - set `tier to paid or free` - - model_name: gpt-4 - litellm_params: - model: openai/gpt-4o - api_key: os.environ/OPENAI_API_KEY - model_info: - tier: paid # πŸ‘ˆ Key Change - set `tier to paid or free` - -general_settings: - master_key: sk-1234 -``` - -### 2. Create Virtual Keys with pricing `tier=free` - -```shell -curl --location 'http://0.0.0.0:4000/key/generate' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "metadata": {"tier": "free"} -}' -``` - -### 3. Make Request with Key on `Free Tier` - -```shell -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-inxzoSurQsjog9gPrVOCcA" \ - -d '{ - "model": "gpt-4", - "messages": [ - {"role": "user", "content": "Hello, Claude gm!"} - ] - }' -``` - -**Expected Response** - -If this worked as expected then `x-litellm-model-api-base` should be `https://exampleopenaiendpoint-production.up.railway.app/` in the response headers - -```shell -x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ - -{"id":"chatcmpl-657b750f581240c1908679ed94b31bfe","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}}% -``` - - -### 4. Create Virtual Keys with pricing `tier=paid` - -```shell -curl --location 'http://0.0.0.0:4000/key/generate' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "metadata": {"tier": "paid"} - }' -``` - -### 5. Make Request with Key on `Paid Tier` - -```shell -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-mnJoeSc6jFjzZr256q-iqA" \ - -d '{ - "model": "gpt-4", - "messages": [ - {"role": "user", "content": "Hello, Claude gm!"} - ] - }' -``` - -**Expected Response** - -If this worked as expected then `x-litellm-model-api-base` should be `https://api.openai.com` in the response headers - -```shell -x-litellm-model-api-base: https://api.openai.com - -{"id":"chatcmpl-9mW75EbJCgwmLcO0M5DmwxpiBgWdc","choices":[{"finish_reason":"stop","index":0,"message":{"content":"Good morning! How can I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1721350215,"model":"gpt-4o-2024-05-13","object":"chat.completion","system_fingerprint":"fp_c4e5b6fa31","usage":{"completion_tokens":10,"prompt_tokens":12,"total_tokens":22}} -``` diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md new file mode 100644 index 000000000..c33bce315 --- /dev/null +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -0,0 +1,38 @@ +# πŸ’Έ Tag Based Routing + +Route requests based on tags + +### 1. 
Define free, paid tier models on config.yaml + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["free"] + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + tags: ["paid"] + +general_settings: + master_key: sk-1234 +``` + +### Make Request with Key on `Free Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4", + "metadata": {"tags": ["paid"]}, + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index eea863d8e..204c27394 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -44,7 +44,7 @@ const sidebars = { "proxy/cost_tracking", "proxy/self_serve", "proxy/virtual_keys", - "proxy/free_paid_tier", + "proxy/tag_routing", "proxy/users", "proxy/team_budgets", "proxy/customers", From fa26d3f96f2b5ff09b53d146006771b0086b5b50 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:49:36 -0700 Subject: [PATCH 23/27] fix test --- litellm/tests/test_router_tag_routing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_router_tag_routing.py b/litellm/tests/test_router_tag_routing.py index feb67c0e9..67f100d79 100644 --- a/litellm/tests/test_router_tag_routing.py +++ b/litellm/tests/test_router_tag_routing.py @@ -58,7 +58,8 @@ async def test_router_free_paid_tier(): }, "model_info": {"id": "very-expensive-model"}, }, - ] + ], + enable_tag_filtering=True, ) for _ in range(5): From d9c051adfffe1cbee1df767105b918397e4d3ed3 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:55:53 -0700 Subject: [PATCH 24/27] add tags to metadata --- litellm/proxy/litellm_pre_call_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index e6bce5392..1014a325a 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -188,6 +188,9 @@ async def add_litellm_data_to_request( CommonProxyErrors.not_premium_user.value, ) llm_router.enable_tag_filtering = False + else: + if "tags" in data: + data[_metadata_variable_name]["tags"] = data["tags"] ### TEAM-SPECIFIC PARAMS ### if user_api_key_dict.team_id is not None: From 6f393be66b860999874b5d60d6f020881596fd84 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 22:18:10 -0700 Subject: [PATCH 25/27] docs - tag based routing --- docs/my-website/docs/proxy/tag_routing.md | 109 ++++++++++++++++++++-- litellm/proxy/proxy_config.yaml | 10 +- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md index c33bce315..763d50918 100644 --- a/docs/my-website/docs/proxy/tag_routing.md +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -1,8 +1,12 @@ # πŸ’Έ Tag Based Routing -Route requests based on tags +Route requests based on tags. +This is useful for implementing free / paid tiers for users -### 1. Define free, paid tier models on config.yaml +### 1. 
Define tags on config.yaml + +- A request with `tags=["free"]` will get routed to `openai/fake` +- A request with `tags=["paid"]` will get routed to `openai/gpt-4o` ```yaml model_list: @@ -11,18 +15,22 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["free"] + tags: ["free"] # πŸ‘ˆ Key Change - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - tags: ["paid"] + tags: ["paid"] # πŸ‘ˆ Key Change +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Change general_settings: master_key: sk-1234 ``` -### Make Request with Key on `Free Tier` +### 2. Make Request with `tags=["free"]` + +This request includes "tags": ["free"], which routes it to `openai/fake` ```shell curl -i http://localhost:4000/v1/chat/completions \ @@ -30,9 +38,96 @@ curl -i http://localhost:4000/v1/chat/completions \ -H "Authorization: Bearer sk-1234" \ -d '{ "model": "gpt-4", - "metadata": {"tags": ["paid"]}, "messages": [ {"role": "user", "content": "Hello, Claude gm!"} - ] + ], + "tags": ["free"] }' ``` +**Expected Response** + +Expect to see the following response header when this works +```shell +x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ +``` + +Response +```shell +{ + "id": "chatcmpl-33c534e3d70148218e2d62496b81270b", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "\n\nHello there, how may I assist you today?", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1677652288, + "model": "gpt-3.5-turbo-0125", + "object": "chat.completion", + "system_fingerprint": "fp_44709d6fcb", + "usage": { + "completion_tokens": 12, + "prompt_tokens": 9, + "total_tokens": 21 + } +} +``` + + +### 3. Make Request with `tags=["paid"]` + +This request includes "tags": ["paid"], which routes it to `openai/gpt-4` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ], + "tags": ["paid"] + }' +``` + +**Expected Response** + +Expect to see the following response header when this works +```shell +x-litellm-model-api-base: https://api.openai.com +``` + +Response +```shell +{ + "id": "chatcmpl-9maCcqQYTqdJrtvfakIawMOIUbEZx", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Good morning! 
How can I assist you today?", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1721365934, + "model": "gpt-4o-2024-05-13", + "object": "chat.completion", + "system_fingerprint": "fp_c4e5b6fa31", + "usage": { + "completion_tokens": 10, + "prompt_tokens": 12, + "total_tokens": 22 + } +} +``` \ No newline at end of file diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 81ed12c07..f20c780cc 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,14 +4,14 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["free"] + tags: ["free"] # πŸ‘ˆ Key Change - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - tags: ["paid"] + tags: ["paid"] # πŸ‘ˆ Key Change +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Change general_settings: - master_key: sk-1234 - - + master_key: sk-1234 \ No newline at end of file From 780a6293dca895e7a381ef2253b3622502dc20df Mon Sep 17 00:00:00 2001 From: Marc Abramowitz Date: Thu, 18 Jul 2024 14:29:32 -0700 Subject: [PATCH 26/27] Alias /health/liveliness as /health/liveness The latter is the more common term in Kubernetes, so it's nice to support that. --- litellm/proxy/_types.py | 1 + litellm/proxy/health_endpoints/_health_endpoints.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index 7464714db..7acd38e8b 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -304,6 +304,7 @@ class LiteLLMRoutes(enum.Enum): "/routes", "/", "/health/liveliness", + "/health/liveness", "/health/readiness", "/test", "/config/yaml", diff --git a/litellm/proxy/health_endpoints/_health_endpoints.py b/litellm/proxy/health_endpoints/_health_endpoints.py index 494d9aa09..fa9edcdc8 100644 --- a/litellm/proxy/health_endpoints/_health_endpoints.py +++ b/litellm/proxy/health_endpoints/_health_endpoints.py @@ -483,7 +483,12 @@ async def health_readiness(): @router.get( - "/health/liveliness", + "/health/liveliness", # Historical LiteLLM name; doesn't match k8s terminology but kept for backwards compatibility + tags=["health"], + dependencies=[Depends(user_api_key_auth)], +) +@router.get( + "/health/liveness", # Kubernetes has "liveness" probes (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command) tags=["health"], dependencies=[Depends(user_api_key_auth)], ) @@ -516,6 +521,11 @@ async def health_readiness_options(): tags=["health"], dependencies=[Depends(user_api_key_auth)], ) +@router.options( + "/health/liveness", # Kubernetes has "liveness" probes (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command) + tags=["health"], + dependencies=[Depends(user_api_key_auth)], +) async def health_liveliness_options(): """ Options endpoint for health/liveliness check. 
From 086486c5c35ed1025d314cf1ef9db5431bcbaf2d Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 18 Jul 2024 22:20:34 -0700
Subject: [PATCH 27/27] bump: version 1.41.24 → 1.41.25
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e5d51a76..5a1d6066d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.41.24"
+version = "1.41.25"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.41.25"
 version_files = [
    "pyproject.toml:^version"
]
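
A minimal end-to-end sketch of the tag-based routing introduced in this series, adapted from `litellm/tests/test_router_tag_routing.py`; the example `api_base` and the `fake-key` placeholder are taken from the test and proxy config above and are illustrative only:

```python
# Sketch: route requests by tag using the Router flag added in this series.
import asyncio

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/fake",
                "api_key": "fake-key",  # placeholder; the example endpoint ignores it
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "tags": ["free"],
            },
            "model_info": {"id": "very-cheap-model"},
        },
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o-mini",
                "api_key": "fake-key",  # placeholder; the example endpoint ignores it
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "tags": ["paid"],
            },
            "model_info": {"id": "very-expensive-model"},
        },
    ],
    enable_tag_filtering=True,  # opt-in flag added in this series
)


async def main() -> None:
    # A request tagged "free" only sees deployments whose litellm_params tags
    # are a superset of the request tags, so it lands on "very-cheap-model".
    response = await router.acompletion(
        model="gpt-4",
        messages=[{"role": "user", "content": "Tell me a joke."}],
        metadata={"tags": ["free"]},
    )
    print(response._hidden_params["model_id"])  # expected: very-cheap-model


asyncio.run(main())
```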