diff --git a/litellm/router.py b/litellm/router.py
index e59612a392..8b9fa765d1 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -326,9 +326,9 @@ class Router:
             litellm.failure_callback.append(self.deployment_callback_on_failure)
         else:
             litellm.failure_callback = [self.deployment_callback_on_failure]
-        verbose_router_logger.info(
+        print(  # noqa
             f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
-        )
+        )  # noqa
         self.routing_strategy_args = routing_strategy_args

     def print_deployment(self, deployment: dict):
@@ -2616,6 +2616,11 @@ class Router:
         for var in vars_to_include:
             if var in _all_vars:
                 _settings_to_return[var] = _all_vars[var]
+            if (
+                var == "routing_strategy_args"
+                and self.routing_strategy == "latency-based-routing"
+            ):
+                _settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
         return _settings_to_return

     def update_settings(self, **kwargs):
diff --git a/litellm/router_strategy/lowest_latency.py b/litellm/router_strategy/lowest_latency.py
index eecf5578ce..80dee5e678 100644
--- a/litellm/router_strategy/lowest_latency.py
+++ b/litellm/router_strategy/lowest_latency.py
@@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
 import dotenv, os, requests, random
 from typing import Optional, Union, List, Dict
 from datetime import datetime, timedelta
+import random

 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
@@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):

 class RoutingArgs(LiteLLMBase):
     ttl: int = 1 * 60 * 60  # 1 hour
+    lowest_latency_buffer: float = 0


 class LowestLatencyLoggingHandler(CustomLogger):
@@ -314,8 +316,12 @@ class LowestLatencyLoggingHandler(CustomLogger):
         # randomly sample from all_deployments, incase all deployments have latency=0.0
         _items = all_deployments.items()

+        all_deployments = random.sample(list(_items), len(_items))
         all_deployments = dict(all_deployments)

+        ### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
+
+        potential_deployments = []
         for item, item_map in all_deployments.items():
             ## get the item from model list
             _deployment = None
@@ -364,17 +370,33 @@ class LowestLatencyLoggingHandler(CustomLogger):
             # End of Debugging Logic
             # -------------- #

-            if item_latency == 0:
-                deployment = _deployment
-                break
-            elif (
+            if (
                 item_tpm + input_tokens > _deployment_tpm
                 or item_rpm + 1 > _deployment_rpm
             ):  # if user passed in tpm / rpm in the model_list
                 continue
-            elif item_latency < lowest_latency:
-                lowest_latency = item_latency
-                deployment = _deployment
+            else:
+                potential_deployments.append((_deployment, item_latency))
+
+        if len(potential_deployments) == 0:
+            return None
+
+        # Sort potential deployments by latency
+        sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
+
+        # Find lowest latency deployment
+        lowest_latency = sorted_deployments[0][1]
+
+        # Find deployments within buffer of lowest latency
+        buffer = self.routing_args.lowest_latency_buffer * lowest_latency
+        valid_deployments = [
+            x for x in sorted_deployments if x[1] <= lowest_latency + buffer
+        ]
+
+        # Pick a random deployment from valid deployments
+        random_valid_deployment = random.choice(valid_deployments)
+        deployment = random_valid_deployment[0]
+
         if request_kwargs is not None and "metadata" in request_kwargs:
             request_kwargs["metadata"][
                 "_latency_per_deployment"
diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py
index 5d1198d538..05eece8344 100644
--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@@ -394,6 +394,8 @@ async def test_async_vertexai_response():
             pass
         except litellm.Timeout as e:
             pass
+        except litellm.APIError as e:
+            pass
         except Exception as e:
             pytest.fail(f"An exception occurred: {e}")

diff --git a/litellm/tests/test_lowest_latency_routing.py b/litellm/tests/test_lowest_latency_routing.py
index 24e6bb4c5d..2f0aaee91d 100644
--- a/litellm/tests/test_lowest_latency_routing.py
+++ b/litellm/tests/test_lowest_latency_routing.py
@@ -631,3 +631,95 @@ async def test_lowest_latency_routing_first_pick():

     # assert that len(deployments) >1
     assert len(deployments) > 1
+
+
+@pytest.mark.parametrize("buffer", [0, 1])
+@pytest.mark.asyncio
+async def test_lowest_latency_routing_buffer(buffer):
+    """
+    Allow shuffling calls within a certain latency buffer
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "rpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-35-turbo",
+                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+                "rpm": 6,
+            },
+            "model_info": {"id": 2},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="latency-based-routing",
+        set_verbose=False,
+        num_retries=3,
+        routing_strategy_args={"lowest_latency_buffer": buffer},
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 1},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 50}}
+    time.sleep(3)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+    ## DEPLOYMENT 2 ##
+    deployment_id = 2
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 2},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 20}}
+    time.sleep(2)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ##
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    selected_deployments = {}
+    for _ in range(50):
+        print(router.get_available_deployment(model="azure-model"))
+        selected_deployments[
+            router.get_available_deployment(model="azure-model")["model_info"]["id"]
+        ] = 1
+
+    if buffer == 0:
+        assert len(selected_deployments.keys()) == 1
+    else:
+        assert len(selected_deployments.keys()) == 2
diff --git a/litellm/tests/test_router_debug_logs.py b/litellm/tests/test_router_debug_logs.py
index 0bc711b157..78b3b44704 100644
--- a/litellm/tests/test_router_debug_logs.py
+++ b/litellm/tests/test_router_debug_logs.py
@@ -81,7 +81,6 @@ def test_async_fallbacks(caplog):
     # Define the expected log messages
     # - error request, falling back notice, success notice
     expected_logs = [
-        "Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
         "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
         "Falling back to model_group = azure/gpt-3.5-turbo",
         "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py
index 364319929e..f2efa70dd2 100644
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@@ -766,10 +766,10 @@ def test_usage_based_routing_fallbacks():
     load_dotenv()

     # Constants for TPM and RPM allocation
-    AZURE_FAST_TPM = 3
-    AZURE_BASIC_TPM = 4
-    OPENAI_TPM = 400
-    ANTHROPIC_TPM = 100000
+    AZURE_FAST_RPM = 3
+    AZURE_BASIC_RPM = 4
+    OPENAI_RPM = 10
+    ANTHROPIC_RPM = 100000

     def get_azure_params(deployment_name: str):
         params = {
@@ -798,22 +798,26 @@ def test_usage_based_routing_fallbacks():
         {
             "model_name": "azure/gpt-4-fast",
             "litellm_params": get_azure_params("chatgpt-v-2"),
-            "tpm": AZURE_FAST_TPM,
+            "model_info": {"id": 1},
+            "rpm": AZURE_FAST_RPM,
         },
         {
             "model_name": "azure/gpt-4-basic",
             "litellm_params": get_azure_params("chatgpt-v-2"),
-            "tpm": AZURE_BASIC_TPM,
+            "model_info": {"id": 2},
+            "rpm": AZURE_BASIC_RPM,
         },
         {
             "model_name": "openai-gpt-4",
             "litellm_params": get_openai_params("gpt-3.5-turbo"),
-            "tpm": OPENAI_TPM,
+            "model_info": {"id": 3},
+            "rpm": OPENAI_RPM,
         },
         {
             "model_name": "anthropic-claude-instant-1.2",
             "litellm_params": get_anthropic_params("claude-instant-1.2"),
-            "tpm": ANTHROPIC_TPM,
+            "model_info": {"id": 4},
+            "rpm": ANTHROPIC_RPM,
         },
     ]
     # litellm.set_verbose=True
@@ -844,10 +848,10 @@ def test_usage_based_routing_fallbacks():
         mock_response="very nice to meet you",
     )
     print("response: ", response)
-    print("response._hidden_params: ", response._hidden_params)
+    print(f"response._hidden_params: {response._hidden_params}")
     # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
     # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
-    assert response._hidden_params["custom_llm_provider"] == "openai"
+    assert response._hidden_params["model_id"] == "1"

     # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
     for i in range(20):
@@ -861,7 +865,7 @@ def test_usage_based_routing_fallbacks():
             print("response._hidden_params: ", response._hidden_params)

             if i == 19:
                 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
-                assert response._hidden_params["custom_llm_provider"] == "anthropic"
+                assert response._hidden_params["model_id"] == "4"
         except Exception as e:
             pytest.fail(f"An exception occurred {e}")
diff --git a/litellm/utils.py b/litellm/utils.py
index e5f7f9d11a..5c5324be13 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8130,7 +8130,10 @@ def exception_type(
                         llm_provider="vertex_ai",
                         response=original_exception.response,
                     )
-                elif "None Unknown Error." in error_str:
+                elif (
+                    "None Unknown Error." in error_str
+                    or "Content has no parts." in error_str
+                ):
                     exception_mapping_worked = True
                     raise APIError(
                         message=f"VertexAIException - {error_str}",
diff --git a/ui/litellm-dashboard/src/components/general_settings.tsx b/ui/litellm-dashboard/src/components/general_settings.tsx
index b9721820d7..c2013b1578 100644
--- a/ui/litellm-dashboard/src/components/general_settings.tsx
+++ b/ui/litellm-dashboard/src/components/general_settings.tsx
@@ -15,7 +15,13 @@ import {
   Grid,
   Button,
   TextInput,
+  Select as Select2,
+  SelectItem,
   Col,
+  Accordion,
+  AccordionBody,
+  AccordionHeader,
+  AccordionList,
 } from "@tremor/react";
 import { TabPanel, TabPanels, TabGroup, TabList, Tab, Icon } from "@tremor/react";
 import { getCallbacksCall, setCallbacksCall, serviceHealthCheck } from "./networking";
@@ -24,6 +30,7 @@ import { InformationCircleIcon, PencilAltIcon, PencilIcon, StatusOnlineIcon, Tra
 import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
 import AddFallbacks from "./add_fallbacks"
 import openai from "openai";
+import Paragraph from "antd/es/skeleton/Paragraph";

 interface GeneralSettingsPageProps {
   accessToken: string | null;
@@ -72,6 +79,62 @@ async function testFallbackModelResponse(
   }
 }

+interface AccordionHeroProps {
+  selectedStrategy: string | null;
+  strategyArgs: routingStrategyArgs;
+  paramExplanation: { [key: string]: string }
+}
+
+interface routingStrategyArgs {
+  ttl?: number;
+  lowest_latency_buffer?: number;
+}
+
+const defaultLowestLatencyArgs: routingStrategyArgs = {
+  "ttl": 3600,
+  "lowest_latency_buffer": 0
+}
+
+export const AccordionHero: React.FC = ({ selectedStrategy, strategyArgs, paramExplanation }) => (
+
+    Routing Strategy Specific Args
+
+      {
+        selectedStrategy == "latency-based-routing" ?
+
+
+
+
+              Setting
+              Value
+
+
+
+            {Object.entries(strategyArgs).map(([param, value]) => (
+
+
+                  {param}
+
+                    {paramExplanation[param]}
+
+
+
+
+
+
+            ))}
+
+
+          : No specific settings
+      }
+
+);
+
 const GeneralSettings: React.FC = ({
   accessToken,
   userRole,
@@ -82,6 +145,8 @@ const GeneralSettings: React.FC = ({
   const [isModalVisible, setIsModalVisible] = useState(false);
   const [form] = Form.useForm();
   const [selectedCallback, setSelectedCallback] = useState(null);
+  const [selectedStrategy, setSelectedStrategy] = useState(null)
+  const [strategySettings, setStrategySettings] = useState(null);

   let paramExplanation: { [key: string]: string } = {
     "routing_strategy_args": "(dict) Arguments to pass to the routing strategy",
@@ -91,6 +156,8 @@ const GeneralSettings: React.FC = ({
     "num_retries": "(int) Number of retries for failed requests. Defaults to 0.",
     "timeout": "(float) Timeout for requests. Defaults to None.",
     "retry_after": "(int) Minimum time to wait before retrying a failed request",
+    "ttl": "(int) Sliding window to look back over when calculating the average latency of a deployment. Default - 1 hour (in seconds).",
+    "lowest_latency_buffer": "(float) Shuffle between deployments within this % of the lowest latency. Default - 0 (i.e. always pick lowest latency)."
   }

   useEffect(() => {
@@ -141,6 +208,7 @@ const GeneralSettings: React.FC = ({
     try {
       await setCallbacksCall(accessToken, payload);
       setRouterSettings({ ...routerSettings });
+      setSelectedStrategy(routerSettings["routing_strategy"])
       message.success("Router settings updated successfully");
     } catch (error) {
       message.error("Failed to update router settings: " + error, 20);
@@ -156,11 +224,33 @@ const GeneralSettings: React.FC = ({

     const updatedVariables = Object.fromEntries(
       Object.entries(router_settings).map(([key, value]) => {
-        if (key !== 'routing_strategy_args') {
+        if (key !== 'routing_strategy_args' && key !== "routing_strategy") {
           return [key, (document.querySelector(`input[name="${key}"]`) as HTMLInputElement)?.value || value];
         }
+        else if (key == "routing_strategy") {
+          return [key, selectedStrategy]
+        }
+        else if (key == "routing_strategy_args" && selectedStrategy == "latency-based-routing") {
+          let setRoutingStrategyArgs: routingStrategyArgs = {}
+
+          const lowestLatencyBufferElement = document.querySelector(`input[name="lowest_latency_buffer"]`) as HTMLInputElement;
+          const ttlElement = document.querySelector(`input[name="ttl"]`) as HTMLInputElement;
+
+          if (lowestLatencyBufferElement?.value) {
+            setRoutingStrategyArgs["lowest_latency_buffer"] = Number(lowestLatencyBufferElement.value)
+          }
+
+          if (ttlElement?.value) {
+            setRoutingStrategyArgs["ttl"] = Number(ttlElement.value)
+          }
+
+          console.log(`setRoutingStrategyArgs: ${setRoutingStrategyArgs}`)
+          return [
+            "routing_strategy_args", setRoutingStrategyArgs
+          ]
+        }
         return null;
-      }).filter(entry => entry !== null) as Iterable<[string, unknown]>
+      }).filter(entry => entry !== null && entry !== undefined) as Iterable<[string, unknown]>
     );

     console.log("updatedVariables", updatedVariables);
@@ -183,6 +273,7 @@ const GeneralSettings: React.FC = ({
     return null;
   }

+
   return (
@@ -203,24 +294,41 @@ const GeneralSettings: React.FC = ({
-                {Object.entries(routerSettings).filter(([param, value]) => param != "fallbacks" && param != "context_window_fallbacks").map(([param, value]) => (
+                {Object.entries(routerSettings).filter(([param, value]) => param != "fallbacks" && param != "context_window_fallbacks" && param != "routing_strategy_args").map(([param, value]) => (
                       {param}

                         {paramExplanation[param]}

-
+                      {
+                        param == "routing_strategy" ?
+
+                          usage-based-routing
+                          latency-based-routing
+                          simple-shuffle
+                           :
+
+                      }

                   ))}
+
+                      0
+                        ? routerSettings['routing_strategy_args']
+                        : defaultLowestLatencyArgs // default value when keys length is 0
+                    }
+                    paramExplanation={paramExplanation}
+                  />
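Usage sketch (illustrative, not part of the patch itself): the new `lowest_latency_buffer` routing arg makes latency-based routing treat every deployment whose average latency is at most `lowest_latency * (1 + lowest_latency_buffer)` as a valid pick and choose among them at random, instead of always pinning traffic to the single fastest deployment. The sketch below assumes the same Router constructor surface exercised in test_lowest_latency_routing.py; the model_list entries are copied from that test and act as placeholders.

    from litellm import Router

    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
            },
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
            },
        },
    ]

    router = Router(
        model_list=model_list,
        routing_strategy="latency-based-routing",
        routing_strategy_args={
            # shuffle across deployments whose average latency is within 50% of the fastest;
            # 0 (the default) always picks the single lowest-latency deployment
            "lowest_latency_buffer": 0.5,
            # sliding window (in seconds) used when averaging per-deployment latency
            "ttl": 3600,
        },
    )

With a buffer of 0 selection degenerates to the previous behavior (always the lowest-latency deployment); with a buffer of 1, any deployment within twice the lowest latency and under its tpm/rpm limits is eligible, which is why the parametrized test expects both deployments to be selected in that case.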