Merge branch 'main' into litellm_organization_table

Krish Dholakia · 2024-03-02 16:09:28 -08:00 · committed by GitHub
commit eaccbf26b7
10 changed files with 146 additions and 75 deletions


@@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
"soft_budget": 5,
}'
```
@@ -93,6 +94,7 @@ Request Params:
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `soft_budget`: *Optional[float]* - Specify a soft budget for a given key. Get alerts when the key crosses its soft budget (see the Python sketch after this list).
- `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises a 429 error if the user's parallel requests exceed the limit.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
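For reference, the same request from Python: a minimal sketch, assuming the proxy runs at the base URL from the curl example above and `sk-1234` stands in for your master key:

```python
import requests

# Generate a key with both a hard and a soft budget.
response = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder master key
    json={
        "team_id": "core-infra",
        "max_budget": 10,  # spend past this blocks the key
        "soft_budget": 5,  # spend past this only fires an alert
    },
)
print(response.json())  # the proxy echoes "soft_budget" back (see the proxy change below)
```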


@@ -79,6 +79,9 @@ max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[str] = (
None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
)
default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
_openai_completion_params = [
"functions",

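Since `default_soft_budget` is plain module-level state, the fallback can be overridden before any keys are generated. A minimal sketch, assuming you tune it from Python rather than the proxy config:

```python
import litellm

# Keys created without an explicit soft_budget fall back to this value
# (50.0 by default, per the change above).
litellm.default_soft_budget = 25.0
```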

@@ -151,6 +151,7 @@ class GenerateRequestBase(LiteLLMBase):
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
allowed_cache_controls: Optional[list] = []
soft_budget: Optional[float] = None
class GenerateKeyRequest(GenerateRequestBase):
@@ -327,7 +328,7 @@ class TeamRequest(LiteLLMBase):
class LiteLLM_BudgetTable(LiteLLMBase):
"""Represents user-controllable params for a LiteLLM_BudgetTable record"""
soft_budget: Optional[float] = None
max_budget: Optional[float] = None
max_parallel_requests: Optional[int] = None
tpm_limit: Optional[int] = None
@@ -366,7 +367,7 @@ class OrganizationRequest(LiteLLMBase):
class BudgetRequest(LiteLLMBase):
budgets: List[str]
class KeyManagementSystem(enum.Enum):
GOOGLE_KMS = "google_kms"
AZURE_KEY_VAULT = "azure_key_vault"
@@ -585,6 +586,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
request_id: str
api_key: str
model: Optional[str] = ""
api_base: Optional[str] = ""
call_type: str
spend: Optional[float] = 0.0
total_tokens: Optional[int] = 0

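The new fields ride on the existing Pydantic models. A hedged sketch of constructing them directly, assuming the models live in `litellm.proxy._types` and that the remaining fields are optional; `created_by`/`updated_by` mirror the constructor call in the proxy change below:

```python
from litellm.proxy._types import GenerateKeyRequest, LiteLLM_BudgetTable

# A key request that opts into a soft budget.
req = GenerateKeyRequest(soft_budget=5.0)

# The budget record the proxy persists for such a key (values illustrative).
budget_row = LiteLLM_BudgetTable(
    soft_budget=5.0,
    max_budget=10.0,
    created_by="user-123",
    updated_by="user-123",
)
print(budget_row.json(exclude_none=True))  # same serialization the proxy uses
```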

@@ -791,6 +791,7 @@ async def user_api_key_auth(
"/global/spend/keys",
"/global/spend/models",
"/global/predict/spend/logs",
"/health/services",
]
# check if the current route startswith any of the allowed routes
if (
@@ -1814,6 +1815,9 @@ async def generate_key_helper_fn(
spend: float,
key_max_budget: Optional[float] = None, # key_max_budget sets the max budget for this key
key_budget_duration: Optional[str] = None,
key_soft_budget: Optional[
float
] = None, # key_soft_budget sets the spend threshold at which alerts fire for this key
max_budget: Optional[float] = None, # max_budget sets the max budget per user
budget_duration: Optional[str] = None, # budget_duration resets the user's budget after a fixed duration
token: Optional[str] = None,
@@ -1873,6 +1877,19 @@ async def generate_key_helper_fn(
rpm_limit = rpm_limit
allowed_cache_controls = allowed_cache_controls
# TODO: @ishaan-jaff: Migrate all budget tracking to use LiteLLM_BudgetTable
if prisma_client is not None:
# create the Budget Row for the LiteLLM Verification Token
budget_row = LiteLLM_BudgetTable(
soft_budget=key_soft_budget or litellm.default_soft_budget,
model_max_budget=model_max_budget or {},
created_by=user_id,
updated_by=user_id,
)
new_budget = prisma_client.jsonify_object(budget_row.json(exclude_none=True))
_budget = await prisma_client.db.litellm_budgettable.create(data={**new_budget}) # type: ignore
_budget_id = getattr(_budget, "id", None)
try:
# Create a new verification token (you may want to enhance this logic based on your needs)
user_data = {
@@ -1910,6 +1927,7 @@ async def generate_key_helper_fn(
"allowed_cache_controls": allowed_cache_controls,
"permissions": permissions_json,
"model_max_budget": model_max_budget_json,
"budget_id": _budget_id,
}
if (
general_settings.get("allow_user_auth", False) == True
@@ -1982,6 +2000,9 @@ async def generate_key_helper_fn(
except Exception as e:
traceback.print_exc()
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
# Add budget-related info to key_data - this ensures it's returned to the caller
key_data["soft_budget"] = key_soft_budget
return key_data
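With that, callers of `generate_key_helper_fn`, and therefore the `/key/generate` response, see the soft budget. Illustratively (shape only; the real payload carries many more fields):

```python
key_data = {
    "key": "sk-...",     # the generated virtual key
    "soft_budget": 5.0,  # echoed back so the dashboard can display it (see the UI change below)
}
```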
@@ -2142,14 +2163,6 @@ async def async_data_generator(response, user_api_key_dict):
except Exception as e:
yield f"data: {str(e)}\n\n"
### ALERTING ###
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
# Streaming is done, yield the [DONE] chunk
done_message = "[DONE]"
yield f"data: {done_message}\n\n"
@@ -2497,14 +2510,6 @@ async def completion(
headers=custom_headers,
)
### ALERTING ###
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
fastapi_response.headers["x-litellm-model-id"] = model_id
return response
except Exception as e:
@@ -2703,14 +2708,6 @@ async def chat_completion(
headers=custom_headers,
)
### ALERTING ###
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
fastapi_response.headers["x-litellm-model-id"] = model_id
### CALL HOOKS ### - modify outgoing data
@@ -2918,12 +2915,6 @@ async def embeddings(
### ALERTING ###
data["litellm_status"] = "success" # used for alerting
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
return response
except Exception as e:
@@ -3069,12 +3060,6 @@ async def image_generation(
### ALERTING ###
data["litellm_status"] = "success" # used for alerting
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
return response
except Exception as e:
@@ -3228,12 +3213,6 @@ async def moderations(
### ALERTING ###
data["litellm_status"] = "success" # used for alerting
end_time = time.time()
asyncio.create_task(
proxy_logging_obj.response_taking_too_long(
start_time=start_time, end_time=end_time, type="slow_response"
)
)
return response
except Exception as e:
@@ -3378,6 +3357,8 @@ async def generate_key_fn(
# if max_budget is passed to /key/generate, use it as key_max_budget, since generate_key_helper_fn is also used to create new users
if "max_budget" in data_json:
data_json["key_max_budget"] = data_json.pop("max_budget", None)
if "soft_budget" in data_json:
data_json["key_soft_budget"] = data_json.pop("soft_budget", None)
if "budget_duration" in data_json:
data_json["key_budget_duration"] = data_json.pop("budget_duration", None)
@@ -6722,6 +6703,50 @@ async def test_endpoint(request: Request):
return {"route": request.url.path}
@router.get(
"/health/services",
tags=["health"],
dependencies=[Depends(user_api_key_auth)],
include_in_schema=False,
)
async def health_services_endpoint(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
service: Literal["slack_budget_alerts"] = fastapi.Query(
description="Specify the service being hit."
),
):
"""
Hidden endpoint.
Used by the UI to let the user check whether Slack alerting is working as expected.
"""
global general_settings, proxy_logging_obj
if service is None:
raise HTTPException(
status_code=400, detail={"error": "Service must be specified."}
)
if service not in ["slack_budget_alerts"]:
raise HTTPException(
status_code=400,
detail={
"error": f"Service must be in list. Service={service}. List={['slack_budget_alerts']}"
},
)
test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` my-secret-project \n`Expected Day of Error`: 28th March \n`Current Spend`: 100 \n`Projected Spend at end of month`: 1000 \n
"""
if "slack" in general_settings.get("alerting", []):
await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
else:
raise HTTPException(
status_code=422,
detail={"error": "No slack connection setup. Unable to test this."},
)
@router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
async def health_endpoint(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),

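To exercise the new hidden endpoint, a hedged sketch (base URL and key are placeholders; the proxy needs `alerting: ["slack"]` in its general settings, otherwise the call returns a 422):

```python
import requests

# Fire a mock budget alert to verify the Slack hook is wired up.
response = requests.get(
    "http://0.0.0.0:8000/health/services",
    params={"service": "slack_budget_alerts"},
    headers={"Authorization": "Bearer sk-1234"},  # placeholder master key
)
print(response.status_code)
```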

@@ -11,10 +11,11 @@ generator client {
model LiteLLM_BudgetTable {
budget_id String @id @default(uuid())
max_budget Float?
soft_budget Float?
max_parallel_requests Int?
tpm_limit BigInt?
rpm_limit BigInt?
model_max_budget Json @default("{}")
model_max_budget Json?
budget_duration String?
budget_reset_at DateTime?
created_at DateTime @default(now()) @map("created_at")
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
allowed_cache_controls String[] @default([])
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
}
// store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
model String @default("")
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")

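The new `budget_id` column links a verification token to its budget row, so resolving a key's soft budget becomes a two-step lookup. A sketch under assumed client methods (`...budgettable.create` appears in the proxy change above; `find_unique` is an assumption about the generated Prisma client):

```python
from typing import Optional

async def get_soft_budget(prisma_client, hashed_token: str) -> Optional[float]:
    # Follow the token's budget_id to its LiteLLM_BudgetTable row.
    token_row = await prisma_client.db.litellm_verificationtoken.find_unique(
        where={"token": hashed_token}
    )
    if token_row is None or token_row.budget_id is None:
        return None
    budget_row = await prisma_client.db.litellm_budgettable.find_unique(
        where={"budget_id": token_row.budget_id}
    )
    return getattr(budget_row, "soft_budget", None)
```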

@@ -64,6 +64,7 @@ class ProxyLogging:
litellm.callbacks.append(self.max_parallel_request_limiter)
litellm.callbacks.append(self.max_budget_limiter)
litellm.callbacks.append(self.cache_control_check)
litellm.callbacks.append(self.response_taking_too_long_callback)
for callback in litellm.callbacks:
if callback not in litellm.input_callback:
litellm.input_callback.append(callback)
@@ -142,6 +143,30 @@ class ProxyLogging:
raise e
return data
async def response_taking_too_long_callback(
self,
kwargs, # kwargs to completion
completion_response, # response from completion
start_time,
end_time, # start/end time
):
if self.alerting is None:
return
time_difference = end_time - start_time
# Convert the timedelta to float (in seconds)
time_difference_float = time_difference.total_seconds()
litellm_params = kwargs.get("litellm_params", {})
api_base = litellm_params.get("api_base", "")
model = kwargs.get("model", "")
messages = kwargs.get("messages", "")
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
if time_difference_float > self.alerting_threshold:
await self.alerting_handler(
message=slow_message + request_info,
level="Low",
)
async def response_taking_too_long(
self,
start_time: Optional[float] = None,
@@ -189,16 +214,6 @@ class ProxyLogging:
level="Medium",
)
elif (
type == "slow_response" and start_time is not None and end_time is not None
):
slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
if end_time - start_time > self.alerting_threshold:
await self.alerting_handler(
message=slow_message + request_info,
level="Low",
)
async def budget_alerts(
self,
type: Literal[
@@ -1585,6 +1600,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
"completion_tokens": usage.get("completion_tokens", 0),
"request_tags": metadata.get("tags", []),
"end_user": kwargs.get("user", ""),
"api_base": litellm_params.get("api_base", ""),
}
verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")

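The refactor replaces the per-endpoint `asyncio.create_task(...)` blocks deleted above with a single callback registered on `litellm.callbacks`. A minimal sketch of hooking in the same way; the four-argument signature matches the method above, while the threshold and the exact callback routing are assumptions:

```python
import litellm

async def my_latency_alert(kwargs, completion_response, start_time, end_time):
    # start_time/end_time arrive as datetimes, as in the callback above.
    elapsed = (end_time - start_time).total_seconds()
    if elapsed > 5.0:  # illustrative threshold
        print(f"slow response ({elapsed:.2f}s) from `{kwargs.get('model', '')}`")

litellm.callbacks.append(my_latency_alert)
```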

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.28.8"
version = "1.28.9"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.28.8"
version = "1.28.9"
version_files = [
"pyproject.toml:^version"
]


@@ -11,6 +11,7 @@ generator client {
model LiteLLM_BudgetTable {
budget_id String @id @default(uuid())
max_budget Float?
soft_budget Float?
max_parallel_requests Int?
tpm_limit BigInt?
rpm_limit BigInt?
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
allowed_cache_controls String[] @default([])
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
}
// store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
model String @default("")
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")


@@ -2,7 +2,7 @@
import React, { useState, useEffect, useRef } from "react";
import { Button, TextInput, Grid, Col } from "@tremor/react";
import { Card, Metric, Text } from "@tremor/react";
import { Card, Metric, Text, Title, Subtitle } from "@tremor/react";
import {
Button as Button2,
Modal,
@@ -38,6 +38,7 @@ const CreateKey: React.FC<CreateKeyProps> = ({
const [form] = Form.useForm();
const [isModalVisible, setIsModalVisible] = useState(false);
const [apiKey, setApiKey] = useState(null);
const [softBudget, setSoftBudget] = useState(null);
const handleOk = () => {
setIsModalVisible(false);
form.resetFields();
@@ -54,8 +55,11 @@ const CreateKey: React.FC<CreateKeyProps> = ({
message.info("Making API Call");
setIsModalVisible(true);
const response = await keyCreateCall(accessToken, userID, formValues);
console.log("key create Response:", response);
setData((prevData) => (prevData ? [...prevData, response] : [response])); // Check if prevData is null
setApiKey(response["key"]);
setSoftBudget(response["soft_budget"]);
message.success("API Key Created");
form.resetFields();
localStorage.removeItem("userData" + userID);
@@ -108,6 +112,9 @@ const CreateKey: React.FC<CreateKeyProps> = ({
))}
</Select>
</Form.Item>
<Form.Item label="Soft Budget (USD) Monthly" name="soft_budget" initialValue={50.00}>
<InputNumber step={0.01} precision={2} defaultValue={50.00} width={200} />
</Form.Item>
<Form.Item label="Max Budget (USD)" name="max_budget">
<InputNumber step={0.01} precision={2} width={200} />
</Form.Item>
@@ -154,28 +161,38 @@ const CreateKey: React.FC<CreateKeyProps> = ({
</Modal>
{apiKey && (
<Modal
title="Save your key"
visible={isModalVisible}
onOk={handleOk}
onCancel={handleCancel}
footer={null}
>
<Grid numItems={1} className="gap-2 w-full">
<Col numColSpan={1}>
<p>
Please save this secret key somewhere safe and accessible. For
security reasons, <b>you will not be able to view it again</b>{" "}
through your LiteLLM account. If you lose this secret key, you
will need to generate a new one.
</p>
</Col>
<Col numColSpan={1}>
{apiKey != null ? (
<Text>API Key: {apiKey}</Text>
) : (
<Text>Key being created, this might take 30s</Text>
)}
</Col>
<Card>
<Title>Save your Key</Title>
<Col numColSpan={1}>
<p>
Please save this secret key somewhere safe and accessible. For
security reasons, <b>you will not be able to view it again</b>{" "}
through your LiteLLM account. If you lose this secret key, you
will need to generate a new one.
</p>
</Col>
<Col numColSpan={1}>
{apiKey != null ? (
<div>
<Text>API Key: {apiKey}</Text>
<Title className="mt-6">Budgets</Title>
<Text>Soft Limit Budget: ${softBudget}</Text>
<Button className="mt-3">
Test Alert
</Button>
</div>
) : (
<Text>Key being created, this might take 30s</Text>
)}
</Col>
</Card>
</Grid>
</Modal>
)}


@@ -105,7 +105,7 @@ const ViewKeySpendReport: React.FC<ViewKeySpendReportProps> = ({
return (
<div>
<Button size = "xs" onClick={showModal}>
<Button size = "xs" onClick={showModal} variant="secondary">
View Spend Report
</Button>
<Modal