docs(team_budgets.md): fix script

Krrish Dholakia 2024-06-22 15:42:05 -07:00
parent fc4e900a23
commit bae7377128
2 changed files with 19 additions and 8 deletions


@@ -154,7 +154,9 @@ litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-
 ### Dynamic TPM Allocation
-Prevent teams from gobbling too much quota.
+Prevent projects from gobbling too much quota.
+Dynamically allocate TPM quota to api keys, based on active keys in that minute.
 1. Setup config.yaml
@@ -190,6 +192,12 @@ litellm --config /path/to/config.yaml
 - Mock response returns 30 total tokens / request
 - Each team will only be able to make 1 request per minute
 """
+"""
+- Run 2 concurrent teams calling same model
+- model has 60 TPM
+- Mock response returns 30 total tokens / request
+- Each team will only be able to make 1 request per minute
+"""
 import requests
 from openai import OpenAI, RateLimitError
@@ -204,7 +212,6 @@ def create_key(api_key: str, base_url: str):
     _response = response.json()
-    print(f"_response: {_response}")
     return _response["key"]

 key_1 = create_key(api_key="sk-1234", base_url="http://0.0.0.0:4000")
@@ -217,19 +224,19 @@ response = openai_client_1.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 1 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))

 # call proxy with key 2 - works
-openai_client_2 = OpenAI(api_key=key_1, base_url="http://0.0.0.0:4000")
+openai_client_2 = OpenAI(api_key=key_2, base_url="http://0.0.0.0:4000")
 response = openai_client_2.chat.completions.with_raw_response.create(
     model="my-fake-model", messages=[{"role": "user", "content": "Hello world!"}],
 )
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 2 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))

 # call proxy with key 2 - fails
@@ -239,6 +246,10 @@ try:
 except RateLimitError as e:
     print("This was rate limited b/c - {}".format(str(e)))
+```
+**Expected Response**
+```
+This was rate limited b/c - Error code: 429 - {'error': {'message': {'error': 'Key=<hashed_token> over available TPM=0. Model TPM=0, Active keys=2'}, 'type': 'None', 'param': 'None', 'code': 429}}
 ```
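
To make the expected 429 above concrete, here is a minimal, illustrative calculation of how the script's quota gets split (plain arithmetic, not LiteLLM code; the variable names are invented): the model is configured with 60 TPM, two keys are active in the same minute, and each mock response consumes 30 tokens, so each key can complete exactly one request before its share is exhausted.

```python
# Illustrative arithmetic only -- names are made up for this sketch.
model_tpm = 60           # TPM configured for "my-fake-model" in config.yaml
active_keys = 2          # key_1 and key_2 both call the model in the same minute
tokens_per_request = 30  # the mock response returns 30 total tokens

available_tpm_per_key = model_tpm // active_keys                 # 60 // 2 = 30
requests_per_key = available_tpm_per_key // tokens_per_request   # 30 // 30 = 1

print(f"each key gets {available_tpm_per_key} TPM -> {requests_per_key} request/minute")
# A second request from the same key in that minute finds no TPM left and is
# rejected with the 429 shown in the expected response above.
```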


@@ -151,8 +151,8 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
 raise HTTPException(
     status_code=429,
     detail={
-        "error": "Team={} over available TPM={}. Model TPM={}, Active teams={}".format(
-            user_api_key_dict.team_id,
+        "error": "Key={} over available TPM={}. Model TPM={}, Active keys={}".format(
+            user_api_key_dict.api_key,
             available_tpm,
             model_tpm,
             active_projects,
@@ -164,7 +164,7 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
 asyncio.create_task(
     self.internal_usage_cache.async_set_cache_sadd(  # this is a set
         model=data["model"],  # type: ignore
-        value=[user_api_key_dict.team_id or "default_team"],
+        value=[user_api_key_dict.token or "default_key"],
     )
 )
 return None
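
The handler change above moves the bookkeeping from teams to keys: the key's hashed token is added to a per-model set of active callers, and the 429 detail reports the key's share of the model TPM. The sketch below is an illustrative approximation of that bookkeeping, not the actual `internal_usage_cache` API (all names here are hypothetical): active keys for a model are collected per minute in a set, and the model's TPM is divided evenly across them.

```python
# Illustrative sketch of per-key dynamic TPM allocation -- not LiteLLM internals.
import time
from collections import defaultdict

# One set of active key hashes per (model, minute) window, loosely mirroring
# the async_set_cache_sadd call in the handler above.
active_keys = defaultdict(set)

def record_request(model: str, hashed_token: str) -> None:
    minute = int(time.time() // 60)
    active_keys[(model, minute)].add(hashed_token or "default_key")

def available_tpm(model: str, model_tpm: int) -> int:
    minute = int(time.time() // 60)
    keys = active_keys[(model, minute)] or {"default_key"}
    return model_tpm // len(keys)

record_request("my-fake-model", "hash-of-key-1")
record_request("my-fake-model", "hash-of-key-2")
print(available_tpm("my-fake-model", 60))  # 30 -> one 30-token request per key per minute
```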