diff --git a/docs/my-website/docs/proxy/team_budgets.md b/docs/my-website/docs/proxy/team_budgets.md
index f6b7c22824..9ab0c07866 100644
--- a/docs/my-website/docs/proxy/team_budgets.md
+++ b/docs/my-website/docs/proxy/team_budgets.md
@@ -154,7 +154,9 @@ litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-
 
 ### Dynamic TPM Allocation
 
-Prevent teams from gobbling too much quota.
+Prevent projects from gobbling too much quota.
+
+Dynamically allocate TPM quota to api keys, based on active keys in that minute.
 
 1. Setup config.yaml
 
@@ -190,6 +192,12 @@ litellm --config /path/to/config.yaml
 - Mock response returns 30 total tokens / request
 - Each team will only be able to make 1 request per minute
 """
+"""
+- Run 2 concurrent teams calling same model
+- model has 60 TPM
+- Mock response returns 30 total tokens / request
+- Each team will only be able to make 1 request per minute
+"""
 import requests
 from openai import OpenAI, RateLimitError
 
@@ -204,7 +212,6 @@ def create_key(api_key: str, base_url: str):
 
     _response = response.json()
 
-    print(f"_response: {_response}")
     return _response["key"]
 
 key_1 = create_key(api_key="sk-1234", base_url="http://0.0.0.0:4000")
@@ -217,19 +224,19 @@ response = openai_client_1.chat.completions.with_raw_response.create(
     model="my-fake-model",
     messages=[{"role": "user", "content": "Hello world!"}],
 )
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 1 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))
 
 # call proxy with key 2 - works
-openai_client_2 = OpenAI(api_key=key_1, base_url="http://0.0.0.0:4000")
+openai_client_2 = OpenAI(api_key=key_2, base_url="http://0.0.0.0:4000")
 
 response = openai_client_2.chat.completions.with_raw_response.create(
     model="my-fake-model",
     messages=[{"role": "user", "content": "Hello world!"}],
 )
-print("Headers for call - {}".format(response.headers))
+print("Headers for call 2 - {}".format(response.headers))
 _response = response.parse()
 print("Total tokens for call - {}".format(_response.usage.total_tokens))
 
 # call proxy with key 2 - fails
@@ -239,6 +246,10 @@ try:
     ...
 except RateLimitError as e:
     print("This was rate limited b/c - {}".format(str(e)))
 
+```
+**Expected Response**
+```
+This was rate limited b/c - Error code: 429 - {'error': {'message': {'error': 'Key= over available TPM=0. Model TPM=0, Active keys=2'}, 'type': 'None', 'param': 'None', 'code': 429}}
 ```
\ No newline at end of file
diff --git a/litellm/proxy/hooks/dynamic_rate_limiter.py b/litellm/proxy/hooks/dynamic_rate_limiter.py
index 8b132ff9ee..95f0ccc13e 100644
--- a/litellm/proxy/hooks/dynamic_rate_limiter.py
+++ b/litellm/proxy/hooks/dynamic_rate_limiter.py
@@ -151,8 +151,8 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
                 raise HTTPException(
                     status_code=429,
                     detail={
-                        "error": "Team={} over available TPM={}. Model TPM={}, Active teams={}".format(
-                            user_api_key_dict.team_id,
+                        "error": "Key={} over available TPM={}. Model TPM={}, Active keys={}".format(
+                            user_api_key_dict.api_key,
                             available_tpm,
                             model_tpm,
                             active_projects,
@@ -164,7 +164,7 @@ class _PROXY_DynamicRateLimitHandler(CustomLogger):
         asyncio.create_task(
             self.internal_usage_cache.async_set_cache_sadd(  # this is a set
                 model=data["model"],  # type: ignore
-                value=[user_api_key_dict.team_id or "default_team"],
+                value=[user_api_key_dict.token or "default_key"],
            )
        )
        return None
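
Note on the behavior change above: the hook now reports and rate-limits per key rather than per team. Below is a minimal sketch of a key-based check, assuming an even split of the model's TPM across active keys; the function and parameter names are illustrative only and not litellm's actual implementation.

```python
# Hedged sketch, NOT the proxy's real code: the even-split formula and all names here
# (check_key_tpm, key_usage_this_minute, ...) are assumptions for illustration.
from typing import Set

from fastapi import HTTPException


def check_key_tpm(
    api_key: str, model_tpm: int, key_usage_this_minute: int, active_keys: Set[str]
) -> int:
    """Split a model's TPM evenly across keys active this minute; 429 when a key's share is spent."""
    active_count = max(len(active_keys), 1)
    per_key_quota = model_tpm // active_count
    available_tpm = max(per_key_quota - key_usage_this_minute, 0)

    if available_tpm == 0:
        # Same shape as the updated error string in the diff: report the key, not the team.
        raise HTTPException(
            status_code=429,
            detail={
                "error": "Key={} over available TPM={}. Model TPM={}, Active keys={}".format(
                    api_key, available_tpm, model_tpm, active_count
                )
            },
        )
    return available_tpm
```

With the doc example's numbers (model TPM of 60, 30-token mock responses, 2 active keys), each key's 30-token share is exhausted after one call, which lines up with the 429 shown in the **Expected Response** block.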
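
The second Python hunk switches the per-model "active projects" set from team IDs to key tokens. A rough sketch of why a set is used, assuming active keys are tracked per minute; the in-memory store and function names below are hypothetical stand-ins for the proxy's internal usage cache.

```python
# Hedged sketch: a per-minute, per-model set of key tokens. Duplicate adds from the
# same key are no-ops, so the active-key count only grows when a new key appears.
import time
from collections import defaultdict
from typing import Optional

# model -> minute window -> set of key tokens seen in that minute (hypothetical store)
_active_keys = defaultdict(lambda: defaultdict(set))


def record_active_key(model: str, token: Optional[str]) -> None:
    """Counterpart of the async_set_cache_sadd call: add this key's token to the model's set."""
    minute = int(time.time() // 60)
    _active_keys[model][minute].add(token or "default_key")


def active_key_count(model: str) -> int:
    """Distinct keys that called this model in the current minute."""
    minute = int(time.time() // 60)
    return len(_active_keys[model][minute])


if __name__ == "__main__":
    record_active_key("my-fake-model", "sk-key-1")
    record_active_key("my-fake-model", "sk-key-1")  # duplicate add is ignored by the set
    record_active_key("my-fake-model", "sk-key-2")
    print(active_key_count("my-fake-model"))  # -> 2
```

Using a set means repeat calls from the same key within the minute do not inflate the active-key count, so a key's share of the model TPM only shrinks when a genuinely new key becomes active.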