forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_organization_table
This commit is contained in:
commit
eaccbf26b7
10 changed files with 146 additions and 75 deletions
|
@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
|
|||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10,
|
||||
"soft_budget": 5
|
||||
}'
|
||||
```
|
||||
|
||||
|
@ -93,6 +94,7 @@ Request Params:
|
|||
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||
- `soft_budget`: *Optional[float]* - Specify a soft budget limit for a given key. Alerts are sent when the key reaches its soft budget.
|
||||
- `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
|
||||
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||
|
|
|
@ -79,6 +79,9 @@ max_budget: float = 0.0 # set the max budget across all providers
|
|||
budget_duration: Optional[str] = (
|
||||
None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
)
|
||||
default_soft_budget: float = (
|
||||
50.0 # by default all litellm proxy keys have a soft budget of 50.0
|
||||
)
|
||||
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
|
|
|
@ -151,6 +151,7 @@ class GenerateRequestBase(LiteLLMBase):
|
|||
rpm_limit: Optional[int] = None
|
||||
budget_duration: Optional[str] = None
|
||||
allowed_cache_controls: Optional[list] = []
|
||||
soft_budget: Optional[float] = None
|
||||
|
||||
|
||||
class GenerateKeyRequest(GenerateRequestBase):
|
||||
|
@ -327,7 +328,7 @@ class TeamRequest(LiteLLMBase):
|
|||
|
||||
class LiteLLM_BudgetTable(LiteLLMBase):
|
||||
"""Represents user-controllable params for a LiteLLM_BudgetTable record"""
|
||||
|
||||
soft_budget: Optional[float] = None
|
||||
max_budget: Optional[float] = None
|
||||
max_parallel_requests: Optional[int] = None
|
||||
tpm_limit: Optional[int] = None
|
||||
|
@ -585,6 +586,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
|
|||
request_id: str
|
||||
api_key: str
|
||||
model: Optional[str] = ""
|
||||
api_base: Optional[str] = ""
|
||||
call_type: str
|
||||
spend: Optional[float] = 0.0
|
||||
total_tokens: Optional[int] = 0
|
||||
|
|
|
@ -791,6 +791,7 @@ async def user_api_key_auth(
|
|||
"/global/spend/keys",
|
||||
"/global/spend/models",
|
||||
"/global/predict/spend/logs",
|
||||
"/health/services",
|
||||
]
|
||||
# check if the current route startswith any of the allowed routes
|
||||
if (
|
||||
|
@ -1814,6 +1815,9 @@ async def generate_key_helper_fn(
|
|||
spend: float,
|
||||
key_max_budget: Optional[float] = None, # key_max_budget is used to Budget Per key
|
||||
key_budget_duration: Optional[str] = None,
|
||||
key_soft_budget: Optional[
|
||||
float
|
||||
] = None, # key_soft_budget is used to Budget Per key
|
||||
max_budget: Optional[float] = None, # max_budget is used to Budget Per user
|
||||
budget_duration: Optional[str] = None, # max_budget is used to Budget Per user
|
||||
token: Optional[str] = None,
|
||||
|
@ -1873,6 +1877,19 @@ async def generate_key_helper_fn(
|
|||
rpm_limit = rpm_limit
|
||||
allowed_cache_controls = allowed_cache_controls
|
||||
|
||||
# TODO: @ishaan-jaff: Migrate all budget tracking to use LiteLLM_BudgetTable
|
||||
if prisma_client is not None:
|
||||
# create the Budget Row for the LiteLLM Verification Token
|
||||
budget_row = LiteLLM_BudgetTable(
|
||||
soft_budget=key_soft_budget or litellm.default_soft_budget,
|
||||
model_max_budget=model_max_budget or {},
|
||||
created_by=user_id,
|
||||
updated_by=user_id,
|
||||
)
|
||||
new_budget = prisma_client.jsonify_object(budget_row.json(exclude_none=True))
|
||||
_budget = await prisma_client.db.litellm_budgettable.create(data={**new_budget}) # type: ignore
|
||||
_budget_id = getattr(_budget, "id", None)
|
||||
|
||||
try:
|
||||
# Create a new verification token (you may want to enhance this logic based on your needs)
|
||||
user_data = {
|
||||
|
@ -1910,6 +1927,7 @@ async def generate_key_helper_fn(
|
|||
"allowed_cache_controls": allowed_cache_controls,
|
||||
"permissions": permissions_json,
|
||||
"model_max_budget": model_max_budget_json,
|
||||
"budget_id": _budget_id,
|
||||
}
|
||||
if (
|
||||
general_settings.get("allow_user_auth", False) == True
|
||||
|
@ -1982,6 +2000,9 @@ async def generate_key_helper_fn(
|
|||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
|
||||
|
||||
# Add budget related info in key_data - this ensures it's returned
|
||||
key_data["soft_budget"] = key_soft_budget
|
||||
return key_data
|
||||
|
||||
|
||||
|
@ -2142,14 +2163,6 @@ async def async_data_generator(response, user_api_key_dict):
|
|||
except Exception as e:
|
||||
yield f"data: {str(e)}\n\n"
|
||||
|
||||
### ALERTING ###
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
# Streaming is done, yield the [DONE] chunk
|
||||
done_message = "[DONE]"
|
||||
yield f"data: {done_message}\n\n"
|
||||
|
@ -2497,14 +2510,6 @@ async def completion(
|
|||
headers=custom_headers,
|
||||
)
|
||||
|
||||
### ALERTING ###
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
fastapi_response.headers["x-litellm-model-id"] = model_id
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -2703,14 +2708,6 @@ async def chat_completion(
|
|||
headers=custom_headers,
|
||||
)
|
||||
|
||||
### ALERTING ###
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
fastapi_response.headers["x-litellm-model-id"] = model_id
|
||||
|
||||
### CALL HOOKS ### - modify outgoing data
|
||||
|
@ -2918,12 +2915,6 @@ async def embeddings(
|
|||
|
||||
### ALERTING ###
|
||||
data["litellm_status"] = "success" # used for alerting
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -3069,12 +3060,6 @@ async def image_generation(
|
|||
|
||||
### ALERTING ###
|
||||
data["litellm_status"] = "success" # used for alerting
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -3228,12 +3213,6 @@ async def moderations(
|
|||
|
||||
### ALERTING ###
|
||||
data["litellm_status"] = "success" # used for alerting
|
||||
end_time = time.time()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.response_taking_too_long(
|
||||
start_time=start_time, end_time=end_time, type="slow_response"
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -3378,6 +3357,8 @@ async def generate_key_fn(
|
|||
# if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
|
||||
if "max_budget" in data_json:
|
||||
data_json["key_max_budget"] = data_json.pop("max_budget", None)
|
||||
if "soft_budget" in data_json:
|
||||
data_json["key_soft_budget"] = data_json.pop("soft_budget", None)
|
||||
|
||||
if "budget_duration" in data_json:
|
||||
data_json["key_budget_duration"] = data_json.pop("budget_duration", None)
|
||||
|
@ -6722,6 +6703,50 @@ async def test_endpoint(request: Request):
|
|||
return {"route": request.url.path}
|
||||
|
||||
|
||||
@router.get(
    "/health/services",
    tags=["health"],
    dependencies=[Depends(user_api_key_auth)],
    include_in_schema=False,
)
async def health_services_endpoint(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
    service: Literal["slack_budget_alerts"] = fastapi.Query(
        description="Specify the service being hit."
    ),
):
    """
    Hidden endpoint.

    Used by the UI to let user check if slack alerting is working as expected.

    Sends a hard-coded sample budget alert through
    `proxy_logging_obj.alerting_handler` when "slack" is listed under
    `general_settings["alerting"]`.

    Parameters:
    - service: the service to test; only "slack_budget_alerts" is accepted.

    Raises:
    - HTTPException(400): `service` is missing or not a supported service.
    - HTTPException(422): slack is not configured under `alerting`.
    """
    global general_settings, proxy_logging_obj

    if service is None:
        raise HTTPException(
            status_code=400, detail={"error": "Service must be specified."}
        )

    if service not in ["slack_budget_alerts"]:
        raise HTTPException(
            status_code=400,
            detail={
                "error": f"Service must be in list. Service={service}. List={['slack_budget_alerts']}"
            },
        )

    test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` my-secret-project \n`Expected Day of Error`: 28th March \n`Current Spend`: 100 \n`Projected Spend at end of month`: 1000 \n
    """

    # `.get("alerting", [])` only falls back to [] when the key is absent; a
    # key that is present but null would make the `in` check raise TypeError.
    alerting_services = general_settings.get("alerting") or []
    if "slack" in alerting_services:
        await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
    else:
        raise HTTPException(
            status_code=422,
            detail={"error": "No slack connection setup. Unable to test this."},
        )
|
||||
|
||||
|
||||
@router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
|
||||
async def health_endpoint(
|
||||
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||
|
|
|
@ -11,10 +11,11 @@ generator client {
|
|||
model LiteLLM_BudgetTable {
|
||||
budget_id String @id @default(uuid())
|
||||
max_budget Float?
|
||||
soft_budget Float?
|
||||
max_parallel_requests Int?
|
||||
tpm_limit BigInt?
|
||||
rpm_limit BigInt?
|
||||
model_max_budget Json @default("{}")
|
||||
model_max_budget Json?
|
||||
budget_duration String?
|
||||
budget_reset_at DateTime?
|
||||
created_at DateTime @default(now()) @map("created_at")
|
||||
|
@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
|
|||
allowed_cache_controls String[] @default([])
|
||||
model_spend Json @default("{}")
|
||||
model_max_budget Json @default("{}")
|
||||
budget_id String?
|
||||
}
|
||||
|
||||
// store proxy config.yaml
|
||||
|
@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
|
|||
startTime DateTime // Assuming start_time is a DateTime field
|
||||
endTime DateTime // Assuming end_time is a DateTime field
|
||||
model String @default("")
|
||||
api_base String @default("")
|
||||
user String @default("")
|
||||
metadata Json @default("{}")
|
||||
cache_hit String @default("")
|
||||
|
|
|
@ -64,6 +64,7 @@ class ProxyLogging:
|
|||
litellm.callbacks.append(self.max_parallel_request_limiter)
|
||||
litellm.callbacks.append(self.max_budget_limiter)
|
||||
litellm.callbacks.append(self.cache_control_check)
|
||||
litellm.callbacks.append(self.response_taking_too_long_callback)
|
||||
for callback in litellm.callbacks:
|
||||
if callback not in litellm.input_callback:
|
||||
litellm.input_callback.append(callback)
|
||||
|
@ -142,6 +143,30 @@ class ProxyLogging:
|
|||
raise e
|
||||
return data
|
||||
|
||||
async def response_taking_too_long_callback(
    self,
    kwargs,  # kwargs to completion
    completion_response,  # response from completion
    start_time,
    end_time,  # start/end time
):
    """
    Send a "Low" level alert when a finished call exceeded `self.alerting_threshold`.

    Parameters:
    - kwargs: kwargs passed to completion; `model`, `messages` and
      `litellm_params["api_base"]` are read for the alert text.
    - completion_response: response from completion (not used here).
    - start_time / end_time: values whose difference exposes
      `.total_seconds()` (e.g. datetime objects).

    Does nothing when `self.alerting` is None or the elapsed time is not
    above the threshold.
    """
    if self.alerting is None:
        return

    # Convert the timedelta to float (in seconds)
    time_difference_float = (end_time - start_time).total_seconds()
    if time_difference_float <= self.alerting_threshold:
        # fast enough - skip building the alert text
        return

    # `litellm_params` may be present but None; `.get(key, {})` would return
    # that None and the following `.get` would raise AttributeError.
    litellm_params = kwargs.get("litellm_params") or {}
    api_base = litellm_params.get("api_base", "")
    model = kwargs.get("model", "")
    messages = kwargs.get("messages", "")
    request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
    slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
    await self.alerting_handler(
        message=slow_message + request_info,
        level="Low",
    )
|
||||
|
||||
async def response_taking_too_long(
|
||||
self,
|
||||
start_time: Optional[float] = None,
|
||||
|
@ -189,16 +214,6 @@ class ProxyLogging:
|
|||
level="Medium",
|
||||
)
|
||||
|
||||
elif (
|
||||
type == "slow_response" and start_time is not None and end_time is not None
|
||||
):
|
||||
slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
|
||||
if end_time - start_time > self.alerting_threshold:
|
||||
await self.alerting_handler(
|
||||
message=slow_message + request_info,
|
||||
level="Low",
|
||||
)
|
||||
|
||||
async def budget_alerts(
|
||||
self,
|
||||
type: Literal[
|
||||
|
@ -1585,6 +1600,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
|
|||
"completion_tokens": usage.get("completion_tokens", 0),
|
||||
"request_tags": metadata.get("tags", []),
|
||||
"end_user": kwargs.get("user", ""),
|
||||
"api_base": litellm_params.get("api_base", ""),
|
||||
}
|
||||
|
||||
verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "litellm"
|
||||
version = "1.28.8"
|
||||
version = "1.28.9"
|
||||
description = "Library to easily interface with LLM API providers"
|
||||
authors = ["BerriAI"]
|
||||
license = "MIT"
|
||||
|
@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
|
|||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.commitizen]
|
||||
version = "1.28.8"
|
||||
version = "1.28.9"
|
||||
version_files = [
|
||||
"pyproject.toml:^version"
|
||||
]
|
||||
|
|
|
@ -11,6 +11,7 @@ generator client {
|
|||
model LiteLLM_BudgetTable {
|
||||
budget_id String @id @default(uuid())
|
||||
max_budget Float?
|
||||
soft_budget Float?
|
||||
max_parallel_requests Int?
|
||||
tpm_limit BigInt?
|
||||
rpm_limit BigInt?
|
||||
|
@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
|
|||
allowed_cache_controls String[] @default([])
|
||||
model_spend Json @default("{}")
|
||||
model_max_budget Json @default("{}")
|
||||
budget_id String?
|
||||
}
|
||||
|
||||
// store proxy config.yaml
|
||||
|
@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
|
|||
startTime DateTime // Assuming start_time is a DateTime field
|
||||
endTime DateTime // Assuming end_time is a DateTime field
|
||||
model String @default("")
|
||||
api_base String @default("")
|
||||
user String @default("")
|
||||
metadata Json @default("{}")
|
||||
cache_hit String @default("")
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
import React, { useState, useEffect, useRef } from "react";
|
||||
import { Button, TextInput, Grid, Col } from "@tremor/react";
|
||||
import { Card, Metric, Text } from "@tremor/react";
|
||||
import { Card, Metric, Text, Title, Subtitle } from "@tremor/react";
|
||||
import {
|
||||
Button as Button2,
|
||||
Modal,
|
||||
|
@ -38,6 +38,7 @@ const CreateKey: React.FC<CreateKeyProps> = ({
|
|||
const [form] = Form.useForm();
|
||||
const [isModalVisible, setIsModalVisible] = useState(false);
|
||||
const [apiKey, setApiKey] = useState(null);
|
||||
const [softBudget, setSoftBudget] = useState(null);
|
||||
const handleOk = () => {
|
||||
setIsModalVisible(false);
|
||||
form.resetFields();
|
||||
|
@ -54,8 +55,11 @@ const CreateKey: React.FC<CreateKeyProps> = ({
|
|||
message.info("Making API Call");
|
||||
setIsModalVisible(true);
|
||||
const response = await keyCreateCall(accessToken, userID, formValues);
|
||||
|
||||
console.log("key create Response:", response);
|
||||
setData((prevData) => (prevData ? [...prevData, response] : [response])); // Check if prevData is null
|
||||
setApiKey(response["key"]);
|
||||
setSoftBudget(response["soft_budget"]);
|
||||
message.success("API Key Created");
|
||||
form.resetFields();
|
||||
localStorage.removeItem("userData" + userID);
|
||||
|
@ -108,6 +112,9 @@ const CreateKey: React.FC<CreateKeyProps> = ({
|
|||
))}
|
||||
</Select>
|
||||
</Form.Item>
|
||||
<Form.Item label="Soft Budget (USD) Monthly" name="soft_budget" initialValue={50.00}>
|
||||
<InputNumber step={0.01} precision={2} defaultValue={50.00} width={200} />
|
||||
</Form.Item>
|
||||
<Form.Item label="Max Budget (USD)" name="max_budget">
|
||||
<InputNumber step={0.01} precision={2} width={200} />
|
||||
</Form.Item>
|
||||
|
@ -154,13 +161,14 @@ const CreateKey: React.FC<CreateKeyProps> = ({
|
|||
</Modal>
|
||||
{apiKey && (
|
||||
<Modal
|
||||
title="Save your key"
|
||||
visible={isModalVisible}
|
||||
onOk={handleOk}
|
||||
onCancel={handleCancel}
|
||||
footer={null}
|
||||
>
|
||||
<Grid numItems={1} className="gap-2 w-full">
|
||||
<Card>
|
||||
<Title>Save your Key</Title>
|
||||
<Col numColSpan={1}>
|
||||
<p>
|
||||
Please save this secret key somewhere safe and accessible. For
|
||||
|
@ -171,11 +179,20 @@ const CreateKey: React.FC<CreateKeyProps> = ({
|
|||
</Col>
|
||||
<Col numColSpan={1}>
|
||||
{apiKey != null ? (
|
||||
<div>
|
||||
<Text>API Key: {apiKey}</Text>
|
||||
<Title className="mt-6">Budgets</Title>
|
||||
<Text>Soft Limit Budget: ${softBudget}</Text>
|
||||
<Button className="mt-3">
|
||||
Test Alert
|
||||
</Button>
|
||||
|
||||
</div>
|
||||
) : (
|
||||
<Text>Key being created, this might take 30s</Text>
|
||||
)}
|
||||
</Col>
|
||||
</Card>
|
||||
</Grid>
|
||||
</Modal>
|
||||
)}
|
||||
|
|
|
@ -105,7 +105,7 @@ const ViewKeySpendReport: React.FC<ViewKeySpendReportProps> = ({
|
|||
|
||||
return (
|
||||
<div>
|
||||
<Button size = "xs" onClick={showModal}>
|
||||
<Button size = "xs" onClick={showModal} variant="secondary">
|
||||
View Spend Report
|
||||
</Button>
|
||||
<Modal
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue