LiteLLM Minor Fixes & Improvements (11/26/2024) (#6913)

* docs(config_settings.md): document all router_settings

* ci(config.yml): add router_settings doc test to ci/cd

* test: debug test on ci/cd

* test: debug ci/cd test

* test: fix test

* fix(team_endpoints.py): skip invalid team object. don't fail `/team/list` call

Failing the whole call causes downstream errors: the UI is then unable to load the team list at all.
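
Conceptually the fix is a guard while building the list response; a hedged sketch (the model and helper names below are illustrative, not the exact `team_endpoints.py` code):

```python
from typing import List, Optional

from pydantic import BaseModel


class TeamRow(BaseModel):  # stand-in for the real team table model
    team_id: str
    team_alias: Optional[str] = None


def build_team_list(raw_rows: List[dict]) -> List[TeamRow]:
    teams: List[TeamRow] = []
    for row in raw_rows:
        try:
            teams.append(TeamRow(**row))
        except Exception:
            # Skip the invalid team object instead of failing the whole /team/list call.
            continue
    return teams
```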

* test(base_llm_unit_tests.py): add 'response_format={"type": "text"}' test to base_llm_unit_tests

Adds complete CI/CD coverage for all 'response_format' values.
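
A rough sketch of what such a test can look like (the model name and test body here are illustrative, not the actual `base_llm_unit_tests.py` code):

```python
import litellm


def test_response_format_type_text():
    # Illustrative check: response_format={"type": "text"} should be accepted
    # and return a plain-text completion, mirroring the existing json coverage.
    response = litellm.completion(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Say hello."}],
        response_format={"type": "text"},
    )
    assert response.choices[0].message.content is not None
```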

* feat(router.py): support wildcard routes in `get_router_model_info()`

Addresses https://github.com/BerriAI/litellm/issues/6914
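
A sketch of how this can be exercised; the wildcard entry and model name are illustrative, and the new `received_model_name` parameter matches the signature change in the diff below:

```python
from litellm import Router

# Illustrative wildcard deployment: any "gemini/<model>" request matches this entry.
router = Router(
    model_list=[
        {
            "model_name": "gemini/*",
            "litellm_params": {"model": "gemini/*"},
        }
    ]
)

# get_router_model_info() now also receives the model name the caller sent,
# so a wildcard deployment can be resolved back to the concrete model.
for deployment in router.get_model_list(model_name="gemini/gemini-1.5-flash") or []:
    model_info = router.get_router_model_info(
        deployment=deployment, received_model_name="gemini/gemini-1.5-flash"
    )
    print(model_info.get("tpm"), model_info.get("rpm"))
```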

* build(model_prices_and_context_window.json): add tpm/rpm limits for all gemini models

Allows rate-limit tracking for Gemini models even when wildcard routing is enabled.

Addresses https://github.com/BerriAI/litellm/issues/6914
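
With the limits published in the model cost map, they become visible through the normal model-info lookup; a minimal sketch (actual limit values are whatever the cost map ships):

```python
import litellm

# Gemini entries in model_prices_and_context_window.json now carry provider
# rate limits next to pricing, surfaced as "tpm" / "rpm" in the model info.
info = litellm.get_model_info(model="gemini/gemini-1.5-flash")
print(info.get("tpm"), info.get("rpm"))
```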

* feat(router.py): add tpm/rpm tracking on success/failure to global_router

Addresses https://github.com/BerriAI/litellm/issues/6914

* feat(router.py): support wildcard routes on router.get_model_group_usage()
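
Rough usage of the updated lookup, assuming an already-configured Router instance (the model group name is illustrative):

```python
import asyncio


async def show_current_usage(router):
    # Resolves the group via get_model_list() / the pattern router, so wildcard
    # groups work too; returns this minute's (tokens, requests) or (None, None).
    tpm_used, rpm_used = await router.get_model_group_usage(
        model_group="gemini/gemini-1.5-flash"
    )
    print(f"tokens used: {tpm_used}, requests used: {rpm_used}")


# asyncio.run(show_current_usage(router))
```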

* fix(router.py): fix linting error

* fix(router.py): implement get_remaining_tokens_and_requests

Addresses https://github.com/BerriAI/litellm/issues/6914
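
In the diff below this lands as `get_remaining_model_group_usage()`, which subtracts the current minute's usage from the model group's tpm/rpm limits and returns OpenAI-style ratelimit headers; a rough usage sketch (model name illustrative):

```python
import asyncio


async def show_remaining(router):
    # Keys such as "x-ratelimit-remaining-tokens" / "x-ratelimit-limit-tokens"
    # are only present when both a limit and current usage are known.
    headers = await router.get_remaining_model_group_usage("gemini/gemini-1.5-flash")
    for name, value in headers.items():
        print(name, value)


# asyncio.run(show_remaining(router))
```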

* fix(router.py): fix linting errors

* test: fix test

* test: fix tests

* docs(config_settings.md): add missing dd env vars to docs

* fix(router.py): check if hidden params is dict
Krish Dholakia 2024-11-28 00:01:38 +05:30 committed by GitHub
parent 5d13302e6b
commit 2d2931a215
22 changed files with 878 additions and 131 deletions

router.py

@@ -41,6 +41,7 @@ from typing import (
import httpx
import openai
from openai import AsyncOpenAI
from pydantic import BaseModel
from typing_extensions import overload
import litellm
@@ -122,6 +123,7 @@ from litellm.types.router import (
ModelInfo,
ProviderBudgetConfigType,
RetryPolicy,
RouterCacheEnum,
RouterErrors,
RouterGeneralSettings,
RouterModelGroupAliasItem,
@@ -239,7 +241,6 @@ class Router:
] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based
provider_budget_config: Optional[ProviderBudgetConfigType] = None,
semaphore: Optional[asyncio.Semaphore] = None,
alerting_config: Optional[AlertingConfig] = None,
router_general_settings: Optional[
RouterGeneralSettings
@@ -315,8 +316,6 @@ class Router:
from litellm._service_logger import ServiceLogging
if semaphore:
self.semaphore = semaphore
self.set_verbose = set_verbose
self.debug_level = debug_level
self.enable_pre_call_checks = enable_pre_call_checks
@@ -506,6 +505,14 @@ class Router:
litellm.success_callback.append(self.sync_deployment_callback_on_success)
else:
litellm.success_callback = [self.sync_deployment_callback_on_success]
if isinstance(litellm._async_failure_callback, list):
litellm._async_failure_callback.append(
self.async_deployment_callback_on_failure
)
else:
litellm._async_failure_callback = [
self.async_deployment_callback_on_failure
]
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
@@ -3291,13 +3298,14 @@ class Router:
):
"""
Track remaining tpm/rpm quota for model in model_list
Currently, only updates TPM usage.
"""
try:
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
deployment_name = kwargs["litellm_params"]["metadata"].get(
"deployment", None
) # stable name - works for wildcard routes as well
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
@@ -3308,6 +3316,8 @@ class Router:
elif isinstance(id, int):
id = str(id)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
_usage_obj = completion_response.get("usage")
total_tokens = _usage_obj.get("total_tokens", 0) if _usage_obj else 0
@@ -3319,13 +3329,14 @@ class Router:
"%H-%M"
) # use the same timezone regardless of system clock
tpm_key = f"global_router:{id}:tpm:{current_minute}"
tpm_key = RouterCacheEnum.TPM.value.format(
id=id, current_minute=current_minute, model=deployment_name
)
# ------------
# Update usage
# ------------
# update cache
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
## TPM
await self.cache.async_increment_cache(
key=tpm_key,
@@ -3334,6 +3345,17 @@ class Router:
ttl=RoutingArgs.ttl.value,
)
## RPM
rpm_key = RouterCacheEnum.RPM.value.format(
id=id, current_minute=current_minute, model=deployment_name
)
await self.cache.async_increment_cache(
key=rpm_key,
value=1,
parent_otel_span=parent_otel_span,
ttl=RoutingArgs.ttl.value,
)
increment_deployment_successes_for_current_minute(
litellm_router_instance=self,
deployment_id=id,
@@ -3446,6 +3468,40 @@ class Router:
except Exception as e:
raise e
async def async_deployment_callback_on_failure(
self, kwargs, completion_response: Optional[Any], start_time, end_time
):
"""
Update RPM usage for a deployment
"""
deployment_name = kwargs["litellm_params"]["metadata"].get(
"deployment", None
) # handles wildcard routes - by giving the original name sent to `litellm.completion`
model_group = kwargs["litellm_params"]["metadata"].get("model_group", None)
model_info = kwargs["litellm_params"].get("model_info", {}) or {}
id = model_info.get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
dt = get_utc_datetime()
current_minute = dt.strftime(
"%H-%M"
) # use the same timezone regardless of system clock
## RPM
rpm_key = RouterCacheEnum.RPM.value.format(
id=id, current_minute=current_minute, model=deployment_name
)
await self.cache.async_increment_cache(
key=rpm_key,
value=1,
parent_otel_span=parent_otel_span,
ttl=RoutingArgs.ttl.value,
)
def log_retry(self, kwargs: dict, e: Exception) -> dict:
"""
When a retry or fallback happens, log the details of the just failed model call - similar to Sentry breadcrumbing
@@ -4123,7 +4179,24 @@ class Router:
raise Exception("Model Name invalid - {}".format(type(model)))
return None
def get_router_model_info(self, deployment: dict) -> ModelMapInfo:
@overload
def get_router_model_info(
self, deployment: dict, received_model_name: str, id: None = None
) -> ModelMapInfo:
pass
@overload
def get_router_model_info(
self, deployment: None, received_model_name: str, id: str
) -> ModelMapInfo:
pass
def get_router_model_info(
self,
deployment: Optional[dict],
received_model_name: str,
id: Optional[str] = None,
) -> ModelMapInfo:
"""
For a given model id, return the model info (max tokens, input cost, output cost, etc.).
@@ -4137,6 +4210,14 @@ class Router:
Raises:
- ValueError -> If model is not mapped yet
"""
if id is not None:
_deployment = self.get_deployment(model_id=id)
if _deployment is not None:
deployment = _deployment.model_dump(exclude_none=True)
if deployment is None:
raise ValueError("Deployment not found")
## GET BASE MODEL
base_model = deployment.get("model_info", {}).get("base_model", None)
if base_model is None:
@@ -4158,10 +4239,27 @@ class Router:
elif custom_llm_provider != "azure":
model = _model
potential_models = self.pattern_router.route(received_model_name)
if "*" in model and potential_models is not None: # if wildcard route
for potential_model in potential_models:
try:
if potential_model.get("model_info", {}).get(
"id"
) == deployment.get("model_info", {}).get("id"):
model = potential_model.get("litellm_params", {}).get(
"model"
)
break
except Exception:
pass
## GET LITELLM MODEL INFO - raises exception, if model is not mapped
model_info = litellm.get_model_info(
model="{}/{}".format(custom_llm_provider, model)
)
if not model.startswith(custom_llm_provider):
model_info_name = "{}/{}".format(custom_llm_provider, model)
else:
model_info_name = model
model_info = litellm.get_model_info(model=model_info_name)
## CHECK USER SET MODEL INFO
user_model_info = deployment.get("model_info", {})
@@ -4211,8 +4309,10 @@ class Router:
total_tpm: Optional[int] = None
total_rpm: Optional[int] = None
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None
for model in self.model_list:
model_list = self.get_model_list(model_name=model_group)
if model_list is None:
return None
for model in model_list:
is_match = False
if (
"model_name" in model and model["model_name"] == model_group
@@ -4227,7 +4327,7 @@ class Router:
if not is_match:
continue
# model in model group found #
litellm_params = LiteLLM_Params(**model["litellm_params"])
litellm_params = LiteLLM_Params(**model["litellm_params"]) # type: ignore
# get configurable clientside auth params
configurable_clientside_auth_params = (
litellm_params.configurable_clientside_auth_params
@@ -4235,38 +4335,30 @@ class Router:
# get model tpm
_deployment_tpm: Optional[int] = None
if _deployment_tpm is None:
_deployment_tpm = model.get("tpm", None)
_deployment_tpm = model.get("tpm", None) # type: ignore
if _deployment_tpm is None:
_deployment_tpm = model.get("litellm_params", {}).get("tpm", None)
_deployment_tpm = model.get("litellm_params", {}).get("tpm", None) # type: ignore
if _deployment_tpm is None:
_deployment_tpm = model.get("model_info", {}).get("tpm", None)
_deployment_tpm = model.get("model_info", {}).get("tpm", None) # type: ignore
if _deployment_tpm is not None:
if total_tpm is None:
total_tpm = 0
total_tpm += _deployment_tpm # type: ignore
# get model rpm
_deployment_rpm: Optional[int] = None
if _deployment_rpm is None:
_deployment_rpm = model.get("rpm", None)
_deployment_rpm = model.get("rpm", None) # type: ignore
if _deployment_rpm is None:
_deployment_rpm = model.get("litellm_params", {}).get("rpm", None)
_deployment_rpm = model.get("litellm_params", {}).get("rpm", None) # type: ignore
if _deployment_rpm is None:
_deployment_rpm = model.get("model_info", {}).get("rpm", None)
_deployment_rpm = model.get("model_info", {}).get("rpm", None) # type: ignore
if _deployment_rpm is not None:
if total_rpm is None:
total_rpm = 0
total_rpm += _deployment_rpm # type: ignore
# get model info
try:
model_info = litellm.get_model_info(model=litellm_params.model)
except Exception:
model_info = None
# get llm provider
model, llm_provider = "", ""
litellm_model, llm_provider = "", ""
try:
model, llm_provider, _, _ = litellm.get_llm_provider(
litellm_model, llm_provider, _, _ = litellm.get_llm_provider(
model=litellm_params.model,
custom_llm_provider=litellm_params.custom_llm_provider,
)
@@ -4277,7 +4369,7 @@ class Router:
if model_info is None:
supported_openai_params = litellm.get_supported_openai_params(
model=model, custom_llm_provider=llm_provider
model=litellm_model, custom_llm_provider=llm_provider
)
if supported_openai_params is None:
supported_openai_params = []
@@ -4367,7 +4459,20 @@ class Router:
model_group_info.supported_openai_params = model_info[
"supported_openai_params"
]
if model_info.get("tpm", None) is not None and _deployment_tpm is None:
_deployment_tpm = model_info.get("tpm")
if model_info.get("rpm", None) is not None and _deployment_rpm is None:
_deployment_rpm = model_info.get("rpm")
if _deployment_tpm is not None:
if total_tpm is None:
total_tpm = 0
total_tpm += _deployment_tpm # type: ignore
if _deployment_rpm is not None:
if total_rpm is None:
total_rpm = 0
total_rpm += _deployment_rpm # type: ignore
if model_group_info is not None:
## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
if total_tpm is not None:
@@ -4419,7 +4524,10 @@ class Router:
self, model_group: str
) -> Tuple[Optional[int], Optional[int]]:
"""
Returns remaining tpm/rpm quota for model group
Returns current tpm/rpm usage for model group
Parameters:
- model_group: str - the received model name from the user (can be a wildcard route).
Returns:
- usage: Tuple[tpm, rpm]
@@ -4430,20 +4538,37 @@ class Router:
) # use the same timezone regardless of system clock
tpm_keys: List[str] = []
rpm_keys: List[str] = []
for model in self.model_list:
if "model_name" in model and model["model_name"] == model_group:
tpm_keys.append(
f"global_router:{model['model_info']['id']}:tpm:{current_minute}"
model_list = self.get_model_list(model_name=model_group)
if model_list is None: # no matching deployments
return None, None
for model in model_list:
id: Optional[str] = model.get("model_info", {}).get("id") # type: ignore
litellm_model: Optional[str] = model["litellm_params"].get(
"model"
) # USE THE MODEL SENT TO litellm.completion() - consistent with how global_router cache is written.
if id is None or litellm_model is None:
continue
tpm_keys.append(
RouterCacheEnum.TPM.value.format(
id=id,
model=litellm_model,
current_minute=current_minute,
)
rpm_keys.append(
f"global_router:{model['model_info']['id']}:rpm:{current_minute}"
)
rpm_keys.append(
RouterCacheEnum.RPM.value.format(
id=id,
model=litellm_model,
current_minute=current_minute,
)
)
combined_tpm_rpm_keys = tpm_keys + rpm_keys
combined_tpm_rpm_values = await self.cache.async_batch_get_cache(
keys=combined_tpm_rpm_keys
)
if combined_tpm_rpm_values is None:
return None, None
@@ -4468,6 +4593,32 @@ class Router:
rpm_usage += t
return tpm_usage, rpm_usage
async def get_remaining_model_group_usage(self, model_group: str) -> Dict[str, int]:
current_tpm, current_rpm = await self.get_model_group_usage(model_group)
model_group_info = self.get_model_group_info(model_group)
if model_group_info is not None and model_group_info.tpm is not None:
tpm_limit = model_group_info.tpm
else:
tpm_limit = None
if model_group_info is not None and model_group_info.rpm is not None:
rpm_limit = model_group_info.rpm
else:
rpm_limit = None
returned_dict = {}
if tpm_limit is not None and current_tpm is not None:
returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - current_tpm
returned_dict["x-ratelimit-limit-tokens"] = tpm_limit
if rpm_limit is not None and current_rpm is not None:
returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - current_rpm
returned_dict["x-ratelimit-limit-requests"] = rpm_limit
return returned_dict
async def set_response_headers(
self, response: Any, model_group: Optional[str] = None
) -> Any:
@@ -4478,6 +4629,30 @@ class Router:
# - if healthy_deployments > 1, return model group rate limit headers
# - else return the model's rate limit headers
"""
if (
isinstance(response, BaseModel)
and hasattr(response, "_hidden_params")
and isinstance(response._hidden_params, dict) # type: ignore
):
response._hidden_params.setdefault("additional_headers", {}) # type: ignore
response._hidden_params["additional_headers"][ # type: ignore
"x-litellm-model-group"
] = model_group
additional_headers = response._hidden_params["additional_headers"] # type: ignore
if (
"x-ratelimit-remaining-tokens" not in additional_headers
and "x-ratelimit-remaining-requests" not in additional_headers
and model_group is not None
):
remaining_usage = await self.get_remaining_model_group_usage(
model_group
)
for header, value in remaining_usage.items():
if value is not None:
additional_headers[header] = value
return response
def get_model_ids(self, model_name: Optional[str] = None) -> List[str]:
@@ -4560,6 +4735,13 @@ class Router:
)
)
if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name)
if potential_wildcard_models is not None:
returned_models.extend(
[DeploymentTypedDict(**m) for m in potential_wildcard_models] # type: ignore
)
if model_name is None:
returned_models += self.model_list
@@ -4810,10 +4992,12 @@ class Router:
base_model = deployment.get("litellm_params", {}).get(
"base_model", None
)
model_info = self.get_router_model_info(
deployment=deployment, received_model_name=model
)
model = base_model or deployment.get("litellm_params", {}).get(
"model", None
)
model_info = self.get_router_model_info(deployment=deployment)
if (
isinstance(model_info, dict)