Merge pull request #3370 from BerriAI/litellm_latency_buffer
fix(lowest_latency.py): allow setting a buffer for getting values within a certain latency threshold
This commit is contained in commit ce9ede6110.
8 changed files with 266 additions and 31 deletions.
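The buffer is opt-in via `routing_strategy_args` when constructing the Router. A minimal sketch of the intended usage, mirroring the test added in this PR (the model names, API keys/bases, and the 0.5 buffer value are illustrative placeholders):

from litellm import Router

# Two deployments serving the same model group; with lowest_latency_buffer=0.5,
# any deployment within 50% of the lowest observed average latency can be picked.
router = Router(
    model_list=[
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",  # placeholder
                "api_base": "https://openai-france-1234.openai.azure.com",
            },
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EUROPE_API_KEY",  # placeholder
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
            },
        },
    ],
    routing_strategy="latency-based-routing",
    routing_strategy_args={"lowest_latency_buffer": 0.5},  # 0 = always pick the single lowest-latency deployment
)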
@@ -326,9 +326,9 @@ class Router:
                 litellm.failure_callback.append(self.deployment_callback_on_failure)
             else:
                 litellm.failure_callback = [self.deployment_callback_on_failure]
-        verbose_router_logger.info(
+        print(  # noqa
             f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
-        )
+        )  # noqa
         self.routing_strategy_args = routing_strategy_args

     def print_deployment(self, deployment: dict):
@@ -2616,6 +2616,11 @@ class Router:
         for var in vars_to_include:
             if var in _all_vars:
                 _settings_to_return[var] = _all_vars[var]
+            if (
+                var == "routing_strategy_args"
+                and self.routing_strategy == "latency-based-routing"
+            ):
+                _settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
         return _settings_to_return

     def update_settings(self, **kwargs):
@@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
 import dotenv, os, requests, random
 from typing import Optional, Union, List, Dict
 from datetime import datetime, timedelta
+import random

 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
@@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):

 class RoutingArgs(LiteLLMBase):
     ttl: int = 1 * 60 * 60  # 1 hour
+    lowest_latency_buffer: float = 0


 class LowestLatencyLoggingHandler(CustomLogger):
@@ -314,8 +316,12 @@ class LowestLatencyLoggingHandler(CustomLogger):

+        # randomly sample from all_deployments, in case all deployments have latency=0.0
+        _items = all_deployments.items()
+        all_deployments = random.sample(list(_items), len(_items))
+        all_deployments = dict(all_deployments)
         ### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits

         potential_deployments = []
         for item, item_map in all_deployments.items():
             ## get the item from model list
             _deployment = None
@@ -364,17 +370,33 @@ class LowestLatencyLoggingHandler(CustomLogger):
             # End of Debugging Logic
             # -------------- #

-            if item_latency == 0:
-                deployment = _deployment
-                break
-            elif (
+            if (
                 item_tpm + input_tokens > _deployment_tpm
                 or item_rpm + 1 > _deployment_rpm
             ):  # if user passed in tpm / rpm in the model_list
                 continue
-            elif item_latency < lowest_latency:
-                lowest_latency = item_latency
-                deployment = _deployment
+            else:
+                potential_deployments.append((_deployment, item_latency))
+
+        if len(potential_deployments) == 0:
+            return None
+
+        # Sort potential deployments by latency
+        sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
+
+        # Find lowest latency deployment
+        lowest_latency = sorted_deployments[0][1]
+
+        # Find deployments within buffer of lowest latency
+        buffer = self.routing_args.lowest_latency_buffer * lowest_latency
+        valid_deployments = [
+            x for x in sorted_deployments if x[1] <= lowest_latency + buffer
+        ]
+
+        # Pick a random deployment from valid deployments
+        random_valid_deployment = random.choice(valid_deployments)
+        deployment = random_valid_deployment[0]

         if request_kwargs is not None and "metadata" in request_kwargs:
             request_kwargs["metadata"][
                 "_latency_per_deployment"
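Stripped of the router plumbing, the new selection step is a few lines of list math. A self-contained sketch, assuming deployments have already been reduced to (deployment, average latency) pairs; pick_deployment is a hypothetical helper for illustration, not part of the PR:

import random

def pick_deployment(deployments, lowest_latency_buffer=0.0):
    # deployments: list of (deployment, average latency in seconds) pairs
    sorted_deployments = sorted(deployments, key=lambda x: x[1])
    lowest_latency = sorted_deployments[0][1]
    # the buffer is a fraction of the lowest latency, so the cutoff scales with it
    buffer = lowest_latency_buffer * lowest_latency
    valid = [x for x in sorted_deployments if x[1] <= lowest_latency + buffer]
    # shuffle between everything inside the buffer
    return random.choice(valid)[0]

# With a 0.5 buffer and a 2.0s floor, the cutoff is 2.0 + 0.5 * 2.0 = 3.0s:
# "b" (2.4s) stays in the pool, "c" (3.5s) does not.
print(pick_deployment([("a", 2.0), ("b", 2.4), ("c", 3.5)], lowest_latency_buffer=0.5))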
@@ -394,6 +394,8 @@ async def test_async_vertexai_response():
             pass
         except litellm.Timeout as e:
             pass
+        except litellm.APIError as e:
+            pass
         except Exception as e:
             pytest.fail(f"An exception occurred: {e}")
@@ -631,3 +631,95 @@ async def test_lowest_latency_routing_first_pick():

     # assert that len(deployments) > 1
     assert len(deployments) > 1
+
+
+@pytest.mark.parametrize("buffer", [0, 1])
+@pytest.mark.asyncio
+async def test_lowest_latency_routing_buffer(buffer):
+    """
+    Allow shuffling calls within a certain latency buffer
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "rpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-35-turbo",
+                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+                "rpm": 6,
+            },
+            "model_info": {"id": 2},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="latency-based-routing",
+        set_verbose=False,
+        num_retries=3,
+        routing_strategy_args={"lowest_latency_buffer": buffer},
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 1},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 50}}
+    time.sleep(3)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+    ## DEPLOYMENT 2 ##
+    deployment_id = 2
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 2},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 20}}
+    time.sleep(2)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ##
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    selected_deployments = {}
+    for _ in range(50):
+        print(router.get_available_deployment(model="azure-model"))
+        selected_deployments[
+            router.get_available_deployment(model="azure-model")["model_info"]["id"]
+        ] = 1
+
+    if buffer == 0:
+        assert len(selected_deployments.keys()) == 1
+    else:
+        assert len(selected_deployments.keys()) == 2
@@ -81,7 +81,6 @@ def test_async_fallbacks(caplog):
     # Define the expected log messages
     # - error request, falling back notice, success notice
     expected_logs = [
-        "Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
         "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
         "Falling back to model_group = azure/gpt-3.5-turbo",
         "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
@@ -766,10 +766,10 @@ def test_usage_based_routing_fallbacks():
     load_dotenv()

     # Constants for TPM and RPM allocation
     AZURE_FAST_TPM = 3
     AZURE_BASIC_TPM = 4
     OPENAI_TPM = 400
     ANTHROPIC_TPM = 100000
     AZURE_FAST_RPM = 3
     AZURE_BASIC_RPM = 4
     OPENAI_RPM = 10
     ANTHROPIC_RPM = 100000

     def get_azure_params(deployment_name: str):
         params = {
@@ -798,22 +798,26 @@ def test_usage_based_routing_fallbacks():
         {
             "model_name": "azure/gpt-4-fast",
             "litellm_params": get_azure_params("chatgpt-v-2"),
             "tpm": AZURE_FAST_TPM,
             "model_info": {"id": 1},
             "rpm": AZURE_FAST_RPM,
         },
         {
             "model_name": "azure/gpt-4-basic",
             "litellm_params": get_azure_params("chatgpt-v-2"),
             "tpm": AZURE_BASIC_TPM,
             "model_info": {"id": 2},
             "rpm": AZURE_BASIC_RPM,
         },
         {
             "model_name": "openai-gpt-4",
             "litellm_params": get_openai_params("gpt-3.5-turbo"),
             "tpm": OPENAI_TPM,
             "model_info": {"id": 3},
             "rpm": OPENAI_RPM,
         },
         {
             "model_name": "anthropic-claude-instant-1.2",
             "litellm_params": get_anthropic_params("claude-instant-1.2"),
             "tpm": ANTHROPIC_TPM,
             "model_info": {"id": 4},
             "rpm": ANTHROPIC_RPM,
         },
     ]
     # litellm.set_verbose=True
@@ -844,10 +848,10 @@ def test_usage_based_routing_fallbacks():
         mock_response="very nice to meet you",
     )
     print("response: ", response)
-    print("response._hidden_params: ", response._hidden_params)
+    print(f"response._hidden_params: {response._hidden_params}")
     # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
     # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
     assert response._hidden_params["custom_llm_provider"] == "openai"
     assert response._hidden_params["model_id"] == "1"

     # now make 20 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
     for i in range(20):
@@ -861,7 +865,7 @@ def test_usage_based_routing_fallbacks():
             print("response._hidden_params: ", response._hidden_params)
             if i == 19:
                 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
                 assert response._hidden_params["custom_llm_provider"] == "anthropic"
                 assert response._hidden_params["model_id"] == "4"

         except Exception as e:
             pytest.fail(f"An exception occurred {e}")
@@ -8130,7 +8130,10 @@ def exception_type(
                 llm_provider="vertex_ai",
                 response=original_exception.response,
             )
-        elif "None Unknown Error." in error_str:
+        elif (
+            "None Unknown Error." in error_str
+            or "Content has no parts." in error_str
+        ):
             exception_mapping_worked = True
             raise APIError(
                 message=f"VertexAIException - {error_str}",
@@ -15,7 +15,13 @@ import {
   Grid,
   Button,
   TextInput,
+  Select as Select2,
+  SelectItem,
   Col,
+  Accordion,
+  AccordionBody,
+  AccordionHeader,
+  AccordionList,
 } from "@tremor/react";
 import { TabPanel, TabPanels, TabGroup, TabList, Tab, Icon } from "@tremor/react";
 import { getCallbacksCall, setCallbacksCall, serviceHealthCheck } from "./networking";
@@ -24,6 +30,7 @@ import { InformationCircleIcon, PencilAltIcon, PencilIcon, StatusOnlineIcon, Tra
 import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
 import AddFallbacks from "./add_fallbacks"
 import openai from "openai";
+import Paragraph from "antd/es/skeleton/Paragraph";

 interface GeneralSettingsPageProps {
   accessToken: string | null;
@@ -72,6 +79,62 @@ async function testFallbackModelResponse(
   }
 }

+interface AccordionHeroProps {
+  selectedStrategy: string | null;
+  strategyArgs: routingStrategyArgs;
+  paramExplanation: { [key: string]: string }
+}
+
+interface routingStrategyArgs {
+  ttl?: number;
+  lowest_latency_buffer?: number;
+}
+
+const defaultLowestLatencyArgs: routingStrategyArgs = {
+  "ttl": 3600,
+  "lowest_latency_buffer": 0
+}
+
+export const AccordionHero: React.FC<AccordionHeroProps> = ({ selectedStrategy, strategyArgs, paramExplanation }) => (
+  <Accordion>
+    <AccordionHeader className="text-sm font-medium text-tremor-content-strong dark:text-dark-tremor-content-strong">Routing Strategy Specific Args</AccordionHeader>
+    <AccordionBody>
+      {
+        selectedStrategy == "latency-based-routing" ?
+          <Card>
+            <Table>
+              <TableHead>
+                <TableRow>
+                  <TableHeaderCell>Setting</TableHeaderCell>
+                  <TableHeaderCell>Value</TableHeaderCell>
+                </TableRow>
+              </TableHead>
+              <TableBody>
+                {Object.entries(strategyArgs).map(([param, value]) => (
+                  <TableRow key={param}>
+                    <TableCell>
+                      <Text>{param}</Text>
+                      <p style={{fontSize: '0.65rem', color: '#808080', fontStyle: 'italic'}} className="mt-1">{paramExplanation[param]}</p>
+                    </TableCell>
+                    <TableCell>
+                      <TextInput
+                        name={param}
+                        defaultValue={
+                          typeof value === 'object' ? JSON.stringify(value, null, 2) : value.toString()
+                        }
+                      />
+                    </TableCell>
+                  </TableRow>
+                ))}
+              </TableBody>
+            </Table>
+          </Card>
+          : <Text>No specific settings</Text>
+      }
+    </AccordionBody>
+  </Accordion>
+);
+
 const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
   accessToken,
   userRole,
@@ -82,6 +145,8 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
   const [isModalVisible, setIsModalVisible] = useState(false);
   const [form] = Form.useForm();
   const [selectedCallback, setSelectedCallback] = useState<string | null>(null);
+  const [selectedStrategy, setSelectedStrategy] = useState<string | null>(null)
+  const [strategySettings, setStrategySettings] = useState<routingStrategyArgs | null>(null);

   let paramExplanation: { [key: string]: string } = {
     "routing_strategy_args": "(dict) Arguments to pass to the routing strategy",
@@ -91,6 +156,8 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
     "num_retries": "(int) Number of retries for failed requests. Defaults to 0.",
     "timeout": "(float) Timeout for requests. Defaults to None.",
     "retry_after": "(int) Minimum time to wait before retrying a failed request",
+    "ttl": "(int) Sliding window to look back over when calculating the average latency of a deployment. Default - 1 hour (in seconds).",
+    "lowest_latency_buffer": "(float) Shuffle between deployments within this % of the lowest latency. Default - 0 (i.e. always pick lowest latency)."
   }

   useEffect(() => {
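The lowest_latency_buffer tooltip added above reads as a percentage of the lowest latency; a quick worked check of the threshold arithmetic (values made up):

lowest_latency_buffer = 0.5  # "within 50% of the lowest latency"
lowest_latency = 2.0         # fastest deployment's average latency, in seconds
threshold = lowest_latency + lowest_latency_buffer * lowest_latency
print(threshold)  # 3.0 -> deployments averaging <= 3.0s stay in the shuffle pool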
@@ -141,6 +208,7 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
     try {
       await setCallbacksCall(accessToken, payload);
       setRouterSettings({ ...routerSettings });
+      setSelectedStrategy(routerSettings["routing_strategy"])
       message.success("Router settings updated successfully");
     } catch (error) {
       message.error("Failed to update router settings: " + error, 20);
@@ -156,11 +224,33 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({

     const updatedVariables = Object.fromEntries(
       Object.entries(router_settings).map(([key, value]) => {
-        if (key !== 'routing_strategy_args') {
+        if (key !== 'routing_strategy_args' && key !== "routing_strategy") {
           return [key, (document.querySelector(`input[name="${key}"]`) as HTMLInputElement)?.value || value];
         }
+        else if (key == "routing_strategy") {
+          return [key, selectedStrategy]
+        }
+        else if (key == "routing_strategy_args" && selectedStrategy == "latency-based-routing") {
+          let setRoutingStrategyArgs: routingStrategyArgs = {}
+
+          const lowestLatencyBufferElement = document.querySelector(`input[name="lowest_latency_buffer"]`) as HTMLInputElement;
+          const ttlElement = document.querySelector(`input[name="ttl"]`) as HTMLInputElement;
+
+          if (lowestLatencyBufferElement?.value) {
+            setRoutingStrategyArgs["lowest_latency_buffer"] = Number(lowestLatencyBufferElement.value)
+          }
+
+          if (ttlElement?.value) {
+            setRoutingStrategyArgs["ttl"] = Number(ttlElement.value)
+          }
+
+          console.log(`setRoutingStrategyArgs: ${setRoutingStrategyArgs}`)
+          return [
+            "routing_strategy_args", setRoutingStrategyArgs
+          ]
+        }
         return null;
-      }).filter(entry => entry !== null) as Iterable<[string, unknown]>
+      }).filter(entry => entry !== null && entry !== undefined) as Iterable<[string, unknown]>
     );
     console.log("updatedVariables", updatedVariables);
@@ -183,6 +273,7 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
     return null;
   }


   return (
     <div className="w-full mx-4">
       <TabGroup className="gap-2 p-8 h-[75vh] w-full mt-2">
@@ -203,24 +294,41 @@ const GeneralSettings: React.FC<GeneralSettingsPageProps> = ({
                 </TableRow>
               </TableHead>
               <TableBody>
-                {Object.entries(routerSettings).filter(([param, value]) => param != "fallbacks" && param != "context_window_fallbacks").map(([param, value]) => (
+                {Object.entries(routerSettings).filter(([param, value]) => param != "fallbacks" && param != "context_window_fallbacks" && param != "routing_strategy_args").map(([param, value]) => (
                   <TableRow key={param}>
                     <TableCell>
                       <Text>{param}</Text>
                       <p style={{fontSize: '0.65rem', color: '#808080', fontStyle: 'italic'}} className="mt-1">{paramExplanation[param]}</p>
                     </TableCell>
                     <TableCell>
-                      <TextInput
-                        name={param}
-                        defaultValue={
-                          typeof value === 'object' ? JSON.stringify(value, null, 2) : value.toString()
-                        }
-                      />
+                      {
+                        param == "routing_strategy" ?
+                          <Select2 defaultValue={value} className="w-full max-w-md" onValueChange={setSelectedStrategy}>
+                            <SelectItem value="usage-based-routing">usage-based-routing</SelectItem>
+                            <SelectItem value="latency-based-routing">latency-based-routing</SelectItem>
+                            <SelectItem value="simple-shuffle">simple-shuffle</SelectItem>
+                          </Select2> :
+                          <TextInput
+                            name={param}
+                            defaultValue={
+                              typeof value === 'object' ? JSON.stringify(value, null, 2) : value.toString()
+                            }
+                          />
+                      }
                     </TableCell>
                   </TableRow>
                 ))}
               </TableBody>
             </Table>
+            <AccordionHero
+              selectedStrategy={selectedStrategy}
+              strategyArgs={
+                routerSettings && routerSettings['routing_strategy_args'] && Object.keys(routerSettings['routing_strategy_args']).length > 0
+                  ? routerSettings['routing_strategy_args']
+                  : defaultLowestLatencyArgs // default value when keys length is 0
+              }
+              paramExplanation={paramExplanation}
+            />
           </Card>
           <Col>
             <Button className="mt-2" onClick={() => handleSaveChanges(routerSettings)}>