docs(team_budgets.md): fix script

Commit bae7377128 (parent fc4e900a23) in https://github.com/BerriAI/litellm.git.
2 changed files with 19 additions and 8 deletions.
The first changed file is the docs page named in the commit message, `team_budgets.md`.

```diff
@@ -154,7 +154,9 @@ litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-
 
 ### Dynamic TPM Allocation
 
-Prevent teams from gobbling too much quota.
+Prevent projects from gobbling too much quota.
 
+Dynamically allocate TPM quota to api keys, based on active keys in that minute.
+
 1. Setup config.yaml
 
```
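For context on the "dynamically allocate" wording above: one plausible reading of the expected 429 later in this diff (`available TPM=0. Model TPM=0, Active keys=2`) is that the model's remaining TPM for the current minute is split evenly across the keys active in that minute. The sketch below only illustrates that idea; the function and parameter names are assumptions, not the proxy's actual internals.

```python
# Illustrative even split of a model's remaining TPM across active keys.
# Names (model_tpm, tokens_used, active_keys) are assumptions for this sketch.
from typing import Set


def available_tpm_per_key(model_tpm: int, tokens_used: int, active_keys: Set[str]) -> int:
    """Split the model's remaining TPM for this minute evenly across active keys."""
    remaining = max(model_tpm - tokens_used, 0)
    if not active_keys:
        return remaining
    return remaining // len(active_keys)


# With the doc's numbers: 60 TPM model, two 30-token mock responses already served,
# two active keys -> 0 TPM left per key, which matches the 429 shown later.
print(available_tpm_per_key(model_tpm=60, tokens_used=60, active_keys={"key_1", "key_2"}))  # 0
```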
```diff
@@ -190,6 +192,12 @@ litellm --config /path/to/config.yaml
 - Mock response returns 30 total tokens / request
 - Each team will only be able to make 1 request per minute
 """
+"""
+- Run 2 concurrent teams calling same model
+- model has 60 TPM
+- Mock response returns 30 total tokens / request
+- Each team will only be able to make 1 request per minute
+"""
 import requests
 from openai import OpenAI, RateLimitError
 
```
```diff
@@ -204,7 +212,6 @@ def create_key(api_key: str, base_url: str):
 
     _response = response.json()
 
-    print(f"_response: {_response}")
     return _response["key"]
 
 key_1 = create_key(api_key="sk-1234", base_url="http://0.0.0.0:4000")
```
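The hunk above only shows the tail of `create_key`. For readers following along, a minimal version of such a helper could look like the sketch below, assuming the proxy's `/key/generate` endpoint and the master key used in the script (`sk-1234`); this is an illustration, not the exact function from the docs page.

```python
# Sketch of a create_key helper: POST to the proxy's /key/generate endpoint
# with the master key, then return the generated virtual key.
import requests


def create_key(api_key: str, base_url: str) -> str:
    response = requests.post(
        url="{}/key/generate".format(base_url),
        json={},  # default key settings; team/budget fields could go here
        headers={"Authorization": "Bearer {}".format(api_key)},
    )
    response.raise_for_status()

    _response = response.json()

    return _response["key"]
```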
```diff
@@ -217,19 +224,19 @@ response = openai_client_1.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )
 
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 1 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))
 
 
 # call proxy with key 2 - works
-openai_client_2 = OpenAI(api_key=key_1, base_url="http://0.0.0.0:4000")
+openai_client_2 = OpenAI(api_key=key_2, base_url="http://0.0.0.0:4000")
 
 response = openai_client_2.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )
 
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 2 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))
 # call proxy with key 2 - fails
```
````diff
@@ -239,6 +246,10 @@ try:
 except RateLimitError as e:
     print("This was rate limited b/c - {}".format(str(e)))
 
 ```
 
+**Expected Response**
+
+```
+This was rate limited b/c - Error code: 429 - {'error': {'message': {'error': 'Key=<hashed_token> over available TPM=0. Model TPM=0, Active keys=2'}, 'type': 'None', 'param': 'None', 'code': 429}}
+```
````
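Not part of the diff, but worth noting for anyone running the script: because quota is reallocated each minute, a caller that hits this 429 can wait for the next minute window and retry. A minimal retry sketch under that assumption (the wrapper name and placeholder key are illustrative):

```python
# Illustrative backoff-and-retry for the per-minute TPM allocation above.
import time

from openai import OpenAI, RateLimitError

client = OpenAI(api_key="sk-...", base_url="http://0.0.0.0:4000")  # placeholder virtual key


def chat_with_retry(prompt: str, max_attempts: int = 3):
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(
                model="my-fake-model",
                messages=[{"role": "user", "content": prompt}],
            )
        except RateLimitError:
            if attempt == max_attempts - 1:
                raise
            # Sleep until the next minute window, when TPM is reallocated.
            time.sleep(60 - time.time() % 60)
```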
The second changed file is the dynamic rate limiter hook (class `_PROXY_DynamicRateLimitHandler`), where the 429 detail and the active-entity tracking switch from teams to keys.

```diff
@@ -151,8 +151,8 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
             raise HTTPException(
                 status_code=429,
                 detail={
-                    "error": "Team={} over available TPM={}. Model TPM={}, Active teams={}".format(
-                        user_api_key_dict.team_id,
+                    "error": "Key={} over available TPM={}. Model TPM={}, Active keys={}".format(
+                        user_api_key_dict.api_key,
                         available_tpm,
                         model_tpm,
                         active_projects,
```
```diff
@@ -164,7 +164,7 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
         asyncio.create_task(
             self.internal_usage_cache.async_set_cache_sadd( # this is a set
                 model=data["model"], # type: ignore
-                value=[user_api_key_dict.team_id or "default_team"],
+                value=[user_api_key_dict.token or "default_key"],
             )
         )
         return None
```
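The `async_set_cache_sadd` call above records, per model, which keys were active in the current window; presumably the size of that set is what feeds the "Active keys" count in the 429 earlier in the diff. A minimal in-memory analogue of that bookkeeping (the structure and names here are assumptions, not the proxy's cache API):

```python
# In-memory analogue of tracking "active keys per model per minute" with a set,
# mirroring the sadd-style call in the hook above. Purely illustrative.
import time
from collections import defaultdict
from typing import DefaultDict, Set, Tuple

active_keys: DefaultDict[Tuple[str, int], Set[str]] = defaultdict(set)


def record_active_key(model: str, key_hash: str) -> int:
    """Add the key to this minute's active set and return the active-key count."""
    window = int(time.time() // 60)  # current minute bucket
    active_keys[(model, window)].add(key_hash)
    return len(active_keys[(model, window)])


# Two different keys calling the same model in the same minute -> 2 active keys.
record_active_key("my-fake-model", "hashed-key-1")
print(record_active_key("my-fake-model", "hashed-key-2"))  # 2
```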