Merge pull request #3192 from BerriAI/litellm_calculate_max_parallel_requests

fix(router.py): Make TPM limits concurrency-safe
Authored by Krish Dholakia on 2024-04-20 13:24:29 -07:00; committed by GitHub
commit fcde3ba213
7 changed files with 208 additions and 19 deletions

@@ -5395,6 +5395,49 @@ def get_optional_params(
    return optional_params


def calculate_max_parallel_requests(
    max_parallel_requests: Optional[int],
    rpm: Optional[int],
    tpm: Optional[int],
    default_max_parallel_requests: Optional[int],
) -> Optional[int]:
    """
    Returns the max parallel requests to send to a deployment.

    Used in semaphore for async requests on router.

    Parameters:
    - max_parallel_requests - Optional[int] - max_parallel_requests allowed for that deployment
    - rpm - Optional[int] - requests per minute allowed for that deployment
    - tpm - Optional[int] - tokens per minute allowed for that deployment
    - default_max_parallel_requests - Optional[int] - default_max_parallel_requests allowed for any deployment

    Returns:
    - int or None (if all params are None)

    Order:
    max_parallel_requests > rpm > tpm / 1000 / 6 (azure formula) > default_max_parallel_requests

    Azure RPM formula:
    6 rpm per 1000 TPM
    https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits
    """
    if max_parallel_requests is not None:
        return max_parallel_requests
    elif rpm is not None:
        return rpm
    elif tpm is not None:
        calculated_rpm = int(tpm / 1000 / 6)
        if calculated_rpm == 0:
            calculated_rpm = 1
        return calculated_rpm
    elif default_max_parallel_requests is not None:
        return default_max_parallel_requests
    return None
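To make the resolution order concrete, here is a small illustrative check; the input values are hypothetical, not from this diff, and it assumes the function is importable from litellm.utils (the file this hunk modifies):

    from litellm.utils import calculate_max_parallel_requests

    # 1. An explicit max_parallel_requests wins outright.
    assert calculate_max_parallel_requests(10, rpm=600, tpm=30_000, default_max_parallel_requests=50) == 10

    # 2. With no explicit cap, rpm is used as-is.
    assert calculate_max_parallel_requests(None, rpm=600, tpm=30_000, default_max_parallel_requests=50) == 600

    # 3. With only tpm, the azure-derived value applies: int(30_000 / 1000 / 6) == 5.
    assert calculate_max_parallel_requests(None, rpm=None, tpm=30_000, default_max_parallel_requests=50) == 5

    # 4. A small tpm is clamped to 1 so the deployment still receives traffic.
    assert calculate_max_parallel_requests(None, rpm=None, tpm=500, default_max_parallel_requests=50) == 1

    # 5. Otherwise fall back to the router-wide default, else None.
    assert calculate_max_parallel_requests(None, rpm=None, tpm=None, default_max_parallel_requests=50) == 50
    assert calculate_max_parallel_requests(None, rpm=None, tpm=None, default_max_parallel_requests=None) is None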
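The docstring notes that the returned value feeds a semaphore guarding async requests on the router. A minimal sketch of that pattern, assuming a hypothetical per-deployment dict with "rpm"/"tpm" keys (the router's actual wiring differs):

    import asyncio
    from typing import Optional

    from litellm.utils import calculate_max_parallel_requests

    def build_semaphore(deployment: dict) -> Optional[asyncio.Semaphore]:
        # Hypothetical helper: size a per-deployment semaphore from its limits.
        limit = calculate_max_parallel_requests(
            max_parallel_requests=deployment.get("max_parallel_requests"),
            rpm=deployment.get("rpm"),
            tpm=deployment.get("tpm"),
            default_max_parallel_requests=None,
        )
        if limit is None:
            return None  # no limits configured -> no concurrency cap
        return asyncio.Semaphore(limit)

    async def guarded_call(semaphore: asyncio.Semaphore) -> None:
        # Holding the semaphore bounds in-flight requests per deployment,
        # which is what makes the TPM/RPM limits concurrency-safe.
        async with semaphore:
            ...  # issue the actual completion call here

Capping in-flight requests this way means concurrent coroutines cannot collectively overshoot a deployment's limits between rate-limit checks.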
def get_api_base(model: str, optional_params: dict) -> Optional[str]:
    """
    Returns the api base used for calling the model.