Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 19:24:27 +00:00)

docs(team_budgets.md): fix script

parent fc4e900a23
commit bae7377128

2 changed files with 19 additions and 8 deletions
@@ -154,7 +154,9 @@ litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-

 ### Dynamic TPM Allocation

-Prevent teams from gobbling too much quota.
+Prevent projects from gobbling too much quota.
+
+Dynamically allocate TPM quota to api keys, based on active keys in that minute.

 1. Setup config.yaml
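The rule behind the Dynamic TPM Allocation change above is an even split of the model's TPM across the keys that were active on it in the current minute. A minimal sketch of that arithmetic (`available_tpm_per_key`, `model_tpm`, and `active_keys` are illustrative names, not the proxy's internals):

```python
# Sketch of dynamic TPM allocation: the model's TPM is divided evenly across
# the keys seen in the current minute (illustrative only).
def available_tpm_per_key(model_tpm: int, active_keys: int) -> int:
    """TPM share a single key gets for the current minute."""
    if active_keys <= 0:
        return model_tpm  # no other traffic recorded: full quota is available
    return model_tpm // active_keys
```

An even split keeps one busy key from starving the others, while a lone active key can still use the model's full quota.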
@@ -190,6 +192,12 @@ litellm --config /path/to/config.yaml
 - Mock response returns 30 total tokens / request
 - Each team will only be able to make 1 request per minute
 """
+"""
+- Run 2 concurrent teams calling same model
+- model has 60 TPM
+- Mock response returns 30 total tokens / request
+- Each team will only be able to make 1 request per minute
+"""
 import requests
 from openai import OpenAI, RateLimitError

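The numbers in the docstring added above follow directly from that split; a quick check of the arithmetic with the scenario's values (illustrative snippet):

```python
# Scenario: a 60 TPM model shared by 2 active keys, with each mock response
# consuming 30 tokens.
model_tpm, active_keys, tokens_per_request = 60, 2, 30

per_key_tpm = model_tpm // active_keys                 # 30 TPM per key this minute
requests_per_key = per_key_tpm // tokens_per_request   # exactly 1 request per key

assert per_key_tpm == 30 and requests_per_key == 1
```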
@@ -204,7 +212,6 @@ def create_key(api_key: str, base_url: str):

     _response = response.json()

-    print(f"_response: {_response}")
     return _response["key"]

 key_1 = create_key(api_key="sk-1234", base_url="http://0.0.0.0:4000")
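Only a slice of the `create_key` helper appears in this hunk. For context, a self-contained sketch of what such a helper typically looks like against the proxy's `/key/generate` endpoint (the empty JSON body and error handling here are assumptions, not the documented script):

```python
import requests


def create_key(api_key: str, base_url: str) -> str:
    """Generate a virtual key via the proxy's /key/generate endpoint (sketch)."""
    response = requests.post(
        url=f"{base_url}/key/generate",
        headers={"Authorization": f"Bearer {api_key}"},
        json={},  # assumption: default key settings are fine for this demo
    )
    response.raise_for_status()
    _response = response.json()
    return _response["key"]
```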
@@ -217,19 +224,19 @@ response = openai_client_1.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )

-print("Headers for call - {}".format(response.headers))
+print("Headers for call 1 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))


 # call proxy with key 2 - works
-openai_client_2 = OpenAI(api_key=key_1, base_url="http://0.0.0.0:4000")
+openai_client_2 = OpenAI(api_key=key_2, base_url="http://0.0.0.0:4000")

 response = openai_client_2.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )

-print("Headers for call - {}".format(response.headers))
+print("Headers for call 2 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))
 # call proxy with key 2 - fails
@@ -239,6 +246,10 @@ try:
 except RateLimitError as e:
     print("This was rate limited b/c - {}".format(str(e)))

 ```

+**Expected Response**

+```
+This was rate limited b/c - Error code: 429 - {'error': {'message': {'error': 'Key=<hashed_token> over available TPM=0. Model TPM=0, Active keys=2'}, 'type': 'None', 'param': 'None', 'code': 429}}
+```
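On the client side this 429 surfaces as an `openai.RateLimitError`, so a caller that wants to wait out the per-minute window can back off and retry. A rough sketch (the fixed 60-second sleep is an assumption about when the quota is re-split):

```python
import time

from openai import OpenAI, RateLimitError

client = OpenAI(api_key="sk-<virtual-key>", base_url="http://0.0.0.0:4000")

try:
    completion = client.chat.completions.create(
        model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
    )
except RateLimitError:
    time.sleep(60)  # assumption: TPM shares are recomputed for the next minute
    completion = client.chat.completions.create(
        model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
    )
```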
@@ -151,8 +151,8 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
                 raise HTTPException(
                     status_code=429,
                     detail={
-                        "error": "Team={} over available TPM={}. Model TPM={}, Active teams={}".format(
-                            user_api_key_dict.team_id,
+                        "error": "Key={} over available TPM={}. Model TPM={}, Active keys={}".format(
+                            user_api_key_dict.api_key,
                             available_tpm,
                             model_tpm,
                             active_projects,
@@ -164,7 +164,7 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
         asyncio.create_task(
             self.internal_usage_cache.async_set_cache_sadd(  # this is a set
                 model=data["model"],  # type: ignore
-                value=[user_api_key_dict.team_id or "default_team"],
+                value=[user_api_key_dict.token or "default_key"],
             )
         )
         return None
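The set written here is what drives the split: each request adds the key's hashed token to a per-model set, and the size of that set becomes the divisor when the available TPM is computed. A simplified in-memory sketch of that bookkeeping (the real handler uses its internal usage cache with a per-minute window, not a plain dict):

```python
from collections import defaultdict

# model -> key tokens seen in the current minute (stand-in for the cache's set)
active_keys_by_model: dict[str, set[str]] = defaultdict(set)


def record_request(model: str, key_token: str | None) -> None:
    """Rough equivalent of the async_set_cache_sadd call above."""
    active_keys_by_model[model].add(key_token or "default_key")


def available_tpm(model: str, model_tpm: int) -> int:
    """Split the model's TPM across the keys active this minute (illustrative)."""
    active = len(active_keys_by_model[model]) or 1
    return model_tpm // active


record_request("my-fake-model", "hashed-token-1")
record_request("my-fake-model", "hashed-token-2")
print(available_tpm("my-fake-model", 60))  # -> 30, matching the docs scenario
```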