(litellm sdk speedup router) - adds a helper _cached_get_model_group_info to use when trying to get deployment tpm/rpm limits (#7719)

* fix _cached_get_model_group_info

* fixes get_remaining_model_group_usage

* test_cached_get_model_group_info
This commit is contained in:
Ishaan Jaff 2025-01-12 15:14:54 -08:00 committed by GitHub
parent bb4b9b8fbf
commit b4a99afee3
2 changed files with 36 additions and 4 deletions

View file

@ -19,6 +19,7 @@ import time
import traceback
import uuid
from collections import defaultdict
from functools import lru_cache
from typing import (
TYPE_CHECKING,
Any,
@ -4696,11 +4697,19 @@ class Router:
rpm_usage += t
return tpm_usage, rpm_usage
def _cached_get_model_group_info(
    self, model_group: str
) -> Optional[ModelGroupInfo]:
    """
    Cached wrapper around get_model_group_info.

    Speed optimization: set_response_headers calls get_model_group_info on
    every request, so the (potentially expensive) lookup is memoized here.

    NOTE(review): this replaces the @lru_cache decorator. lru_cache on an
    instance method (ruff B019) keys on `self`, so the module-global cache
    keeps every Router instance alive for the process lifetime. Instead we
    keep a small per-instance dict (created lazily so __init__ needs no
    change) with FIFO eviction at the same 64-entry bound.

    Args:
        model_group: name of the model group to look up.

    Returns:
        The ModelGroupInfo for `model_group`, or None if unknown.
        None results are cached too, matching lru_cache semantics.
    """
    # Lazily create the per-instance cache; __dict__.setdefault avoids
    # touching __init__ and is safe even on a bare instance.
    cache = self.__dict__.setdefault("_model_group_info_cache", {})
    if model_group not in cache:
        # FIFO eviction keeps the cache bounded like lru_cache(maxsize=64);
        # dicts preserve insertion order, so the first key is the oldest.
        if len(cache) >= 64:
            cache.pop(next(iter(cache)))
        cache[model_group] = self.get_model_group_info(model_group)
    return cache[model_group]
async def get_remaining_model_group_usage(self, model_group: str) -> Dict[str, int]:
current_tpm, current_rpm = await self.get_model_group_usage(model_group)
model_group_info = self.get_model_group_info(model_group)
model_group_info = self._cached_get_model_group_info(model_group)
if model_group_info is not None and model_group_info.tpm is not None:
tpm_limit = model_group_info.tpm
@ -4712,6 +4721,11 @@ class Router:
else:
rpm_limit = None
if tpm_limit is None and rpm_limit is None:
return {}
current_tpm, current_rpm = await self.get_model_group_usage(model_group)
returned_dict = {}
if tpm_limit is not None:
returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - (