#### What this does ####
# Picks the deployment with the lowest recent latency
# (for streaming requests, latency is measured as time to first token).
import random
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Union

from pydantic import BaseModel

import litellm
from litellm import ModelResponse, token_counter, verbose_logger
from litellm.caching.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger


class LiteLLMBase(BaseModel):
    """
    Implements default functions that all pydantic objects should have.
    """

    def json(self, **kwargs):  # type: ignore
        try:
            return self.model_dump()  # noqa
        except Exception:
            # fallback if using pydantic v1
            return self.dict()


class RoutingArgs(LiteLLMBase):
    ttl: float = 1 * 60 * 60  # 1 hour
    lowest_latency_buffer: float = 0
    max_latency_list_size: int = 10
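
# Example routing_args a caller might pass (illustrative values, not defaults):
#   {"ttl": 3600, "lowest_latency_buffer": 0.5, "max_latency_list_size": 10}
# ttl bounds how long the per-model-group usage map lives in the cache,
# lowest_latency_buffer widens the pool of "good enough" deployments
# (0.5 keeps anything within 50% of the fastest), and max_latency_list_size
# caps how many latency samples are kept per deployment.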


class LowestLatencyLoggingHandler(CustomLogger):
    test_flag: bool = False
    logged_success: int = 0
    logged_failure: int = 0

    def __init__(
        self,
        router_cache: DualCache,
        model_list: list,
        routing_args: Optional[dict] = None,
    ):
        self.router_cache = router_cache
        self.model_list = model_list
        # avoid a shared mutable default; None means "use RoutingArgs defaults"
        self.routing_args = RoutingArgs(**(routing_args or {}))

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        try:
            """
            Update latency usage on success
            """
            if kwargs["litellm_params"].get("metadata") is None:
                pass
            else:
                model_group = kwargs["litellm_params"]["metadata"].get(
                    "model_group", None
                )

                id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
                if model_group is None or id is None:
                    return
                elif isinstance(id, int):
                    id = str(id)

                # ------------
                # Setup values
                # ------------
                """
                {
                    {model_group}_map: {
                        id: {
                            "latency": [..]
                            "time_to_first_token": [..]
                            f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
                        }
                    }
                }
                """
                latency_key = f"{model_group}_map"

                current_date = datetime.now().strftime("%Y-%m-%d")
                current_hour = datetime.now().strftime("%H")
                current_minute = datetime.now().strftime("%M")
                precise_minute = f"{current_date}-{current_hour}-{current_minute}"

                response_ms: timedelta = end_time - start_time
                time_to_first_token_response_time: Optional[timedelta] = None

                if kwargs.get("stream") is True:
                    # only log ttft for streaming requests
                    time_to_first_token_response_time = (
                        kwargs.get("completion_start_time", end_time) - start_time
                    )

                final_value: Union[float, timedelta] = response_ms
                time_to_first_token: Optional[float] = None
                total_tokens = 0

                if isinstance(response_obj, ModelResponse):
                    _usage = getattr(response_obj, "usage", None)
                    if _usage is not None:
                        completion_tokens = _usage.completion_tokens
                        total_tokens = _usage.total_tokens
                        # guard against zero completion tokens to avoid ZeroDivisionError
                        if completion_tokens:
                            final_value = float(
                                response_ms.total_seconds() / completion_tokens
                            )

                            if time_to_first_token_response_time is not None:
                                time_to_first_token = float(
                                    time_to_first_token_response_time.total_seconds()
                                    / completion_tokens
                                )

                # ------------
                # Update usage
                # ------------

                request_count_dict = self.router_cache.get_cache(key=latency_key) or {}

                if id not in request_count_dict:
                    request_count_dict[id] = {}

                ## Latency: once the list is full, keep the oldest
                ## max_latency_list_size - 1 samples and append the newest
                if (
                    len(request_count_dict[id].get("latency", []))
                    < self.routing_args.max_latency_list_size
                ):
                    request_count_dict[id].setdefault("latency", []).append(final_value)
                else:
                    request_count_dict[id]["latency"] = request_count_dict[id][
                        "latency"
                    ][: self.routing_args.max_latency_list_size - 1] + [final_value]

                ## Time to first token
                if time_to_first_token is not None:
                    if (
                        len(request_count_dict[id].get("time_to_first_token", []))
                        < self.routing_args.max_latency_list_size
                    ):
                        request_count_dict[id].setdefault(
                            "time_to_first_token", []
                        ).append(time_to_first_token)
                    else:
                        request_count_dict[id][
                            "time_to_first_token"
                        ] = request_count_dict[id]["time_to_first_token"][
                            : self.routing_args.max_latency_list_size - 1
                        ] + [time_to_first_token]

                if precise_minute not in request_count_dict[id]:
                    request_count_dict[id][precise_minute] = {}

                ## TPM
                request_count_dict[id][precise_minute]["tpm"] = (
                    request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
                )

                ## RPM
                request_count_dict[id][precise_minute]["rpm"] = (
                    request_count_dict[id][precise_minute].get("rpm", 0) + 1
                )

                self.router_cache.set_cache(
                    key=latency_key, value=request_count_dict, ttl=self.routing_args.ttl
                )  # reset map within window

                ### TESTING ###
                if self.test_flag:
                    self.logged_success += 1
        except Exception as e:
            verbose_logger.exception(
                "litellm.router_strategy.lowest_latency.py::log_success_event(): Exception occurred - {}".format(
                    str(e)
                )
            )
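
    # Worked example of the per-token normalization above (illustrative numbers):
    # a non-streaming call that takes 2.0s and returns 100 completion tokens is
    # recorded as 2.0 / 100 = 0.02s per token, so long responses are not
    # penalized relative to short ones. A cache entry then looks roughly like:
    #   "gpt-3.5-turbo_map": {
    #       "<deployment-id>": {
    #           "latency": [0.02, 0.018],
    #           "time_to_first_token": [0.4],  # streaming calls only
    #           "2024-01-01-12-05": {"tpm": 1234, "rpm": 4},
    #       }
    #   }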

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """
        Check if Timeout Error, if timeout set deployment latency -> 1000
        """
        try:
            _exception = kwargs.get("exception", None)
            if isinstance(_exception, litellm.Timeout):
                if kwargs["litellm_params"].get("metadata") is None:
                    pass
                else:
                    model_group = kwargs["litellm_params"]["metadata"].get(
                        "model_group", None
                    )

                    id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
                    if model_group is None or id is None:
                        return
                    elif isinstance(id, int):
                        id = str(id)

                    # ------------
                    # Setup values
                    # ------------
                    """
                    {
                        {model_group}_map: {
                            id: {
                                "latency": [..]
                                f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
                            }
                        }
                    }
                    """
                    latency_key = f"{model_group}_map"
                    request_count_dict = (
                        self.router_cache.get_cache(key=latency_key) or {}
                    )

                    if id not in request_count_dict:
                        request_count_dict[id] = {}

                    ## Latency - give a 1000s penalty for failing
                    if (
                        len(request_count_dict[id].get("latency", []))
                        < self.routing_args.max_latency_list_size
                    ):
                        request_count_dict[id].setdefault("latency", []).append(1000.0)
                    else:
                        request_count_dict[id]["latency"] = request_count_dict[id][
                            "latency"
                        ][: self.routing_args.max_latency_list_size - 1] + [1000.0]

                    self.router_cache.set_cache(
                        key=latency_key,
                        value=request_count_dict,
                        ttl=self.routing_args.ttl,
                    )  # reset map within window
            else:
                # do nothing if it's not a timeout error
                return
        except Exception as e:
            verbose_logger.exception(
                "litellm.router_strategy.lowest_latency.py::async_log_failure_event(): Exception occurred - {}".format(
                    str(e)
                )
            )
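
    # Design note: the 1000.0s value above is deliberately far larger than any
    # realistic per-token latency, so a deployment that just timed out sorts to
    # the back of the pool for the rest of the TTL window without being removed
    # from rotation outright.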

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        try:
            """
            Update latency usage on success
            """
            if kwargs["litellm_params"].get("metadata") is None:
                pass
            else:
                model_group = kwargs["litellm_params"]["metadata"].get(
                    "model_group", None
                )

                id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
                if model_group is None or id is None:
                    return
                elif isinstance(id, int):
                    id = str(id)

                # ------------
                # Setup values
                # ------------
                """
                {
                    {model_group}_map: {
                        id: {
                            "latency": [..]
                            "time_to_first_token": [..]
                            f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
                        }
                    }
                }
                """
                latency_key = f"{model_group}_map"

                current_date = datetime.now().strftime("%Y-%m-%d")
                current_hour = datetime.now().strftime("%H")
                current_minute = datetime.now().strftime("%M")
                precise_minute = f"{current_date}-{current_hour}-{current_minute}"

                response_ms: timedelta = end_time - start_time
                time_to_first_token_response_time: Optional[timedelta] = None
                if kwargs.get("stream") is True:
                    # only log ttft for streaming requests
                    time_to_first_token_response_time = (
                        kwargs.get("completion_start_time", end_time) - start_time
                    )

                final_value: Union[float, timedelta] = response_ms
                total_tokens = 0
                time_to_first_token: Optional[float] = None

                if isinstance(response_obj, ModelResponse):
                    _usage = getattr(response_obj, "usage", None)
                    if _usage is not None:
                        completion_tokens = _usage.completion_tokens
                        total_tokens = _usage.total_tokens
                        # guard against zero completion tokens to avoid ZeroDivisionError
                        if completion_tokens:
                            final_value = float(
                                response_ms.total_seconds() / completion_tokens
                            )

                            if time_to_first_token_response_time is not None:
                                time_to_first_token = float(
                                    time_to_first_token_response_time.total_seconds()
                                    / completion_tokens
                                )
                # ------------
                # Update usage
                # ------------

                request_count_dict = self.router_cache.get_cache(key=latency_key) or {}

                if id not in request_count_dict:
                    request_count_dict[id] = {}

                ## Latency: once the list is full, keep the oldest
                ## max_latency_list_size - 1 samples and append the newest
                if (
                    len(request_count_dict[id].get("latency", []))
                    < self.routing_args.max_latency_list_size
                ):
                    request_count_dict[id].setdefault("latency", []).append(final_value)
                else:
                    request_count_dict[id]["latency"] = request_count_dict[id][
                        "latency"
                    ][: self.routing_args.max_latency_list_size - 1] + [final_value]

                ## Time to first token
                if time_to_first_token is not None:
                    if (
                        len(request_count_dict[id].get("time_to_first_token", []))
                        < self.routing_args.max_latency_list_size
                    ):
                        request_count_dict[id].setdefault(
                            "time_to_first_token", []
                        ).append(time_to_first_token)
                    else:
                        request_count_dict[id][
                            "time_to_first_token"
                        ] = request_count_dict[id]["time_to_first_token"][
                            : self.routing_args.max_latency_list_size - 1
                        ] + [time_to_first_token]

                if precise_minute not in request_count_dict[id]:
                    request_count_dict[id][precise_minute] = {}

                ## TPM
                request_count_dict[id][precise_minute]["tpm"] = (
                    request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
                )

                ## RPM
                request_count_dict[id][precise_minute]["rpm"] = (
                    request_count_dict[id][precise_minute].get("rpm", 0) + 1
                )

                self.router_cache.set_cache(
                    key=latency_key, value=request_count_dict, ttl=self.routing_args.ttl
                )  # reset map within window

                ### TESTING ###
                if self.test_flag:
                    self.logged_success += 1
        except Exception as e:
            verbose_logger.exception(
                "litellm.router_strategy.lowest_latency.py::async_log_success_event(): Exception occurred - {}".format(
                    str(e)
                )
            )
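
    # Worked example of the per-minute counters above (illustrative numbers):
    # two calls routed to the same deployment inside the same minute, using 120
    # and 80 total tokens, leave {"tpm": 200, "rpm": 2} under that minute's
    # f"{date}-{hour}-{minute}" key; get_available_deployments() compares these
    # against any tpm/rpm limits configured on the deployment.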

    def get_available_deployments(
        self,
        model_group: str,
        healthy_deployments: list,
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Returns a deployment with the lowest latency
        """
        # get list of potential deployments
        latency_key = f"{model_group}_map"
        _latency_per_deployment = {}

        request_count_dict = self.router_cache.get_cache(key=latency_key) or {}

        # ----------------------------------
        # Find the lowest-latency deployment
        # ----------------------------------
        lowest_latency = float("inf")

        current_date = datetime.now().strftime("%Y-%m-%d")
        current_hour = datetime.now().strftime("%H")
        current_minute = datetime.now().strftime("%M")
        precise_minute = f"{current_date}-{current_hour}-{current_minute}"

        deployment = None

        if request_count_dict is None:  # base case
            return

        all_deployments = request_count_dict
        for d in healthy_deployments:
            ## seed any healthy deployment that has no usage recorded yet
            if d["model_info"]["id"] not in all_deployments:
                all_deployments[d["model_info"]["id"]] = {
                    "latency": [0],
                    precise_minute: {"tpm": 0, "rpm": 0},
                }

        try:
            input_tokens = token_counter(messages=messages, text=input)
        except Exception:
            input_tokens = 0

        # randomly sample from all_deployments, in case all deployments have latency=0.0
        _items = all_deployments.items()

        all_deployments = random.sample(list(_items), len(_items))
        all_deployments = dict(all_deployments)
        ### GET AVAILABLE DEPLOYMENTS ### filter out any deployments over their tpm/rpm limits

        potential_deployments = []
        for item, item_map in all_deployments.items():
            ## get the item from the model list
            _deployment = None
            for m in healthy_deployments:
                if item == m["model_info"]["id"]:
                    _deployment = m

            if _deployment is None:
                continue  # skip to next one

            _deployment_tpm = (
                _deployment.get("tpm", None)
                or _deployment.get("litellm_params", {}).get("tpm", None)
                or _deployment.get("model_info", {}).get("tpm", None)
                or float("inf")
            )

            _deployment_rpm = (
                _deployment.get("rpm", None)
                or _deployment.get("litellm_params", {}).get("rpm", None)
                or _deployment.get("model_info", {}).get("rpm", None)
                or float("inf")
            )
            item_latency = item_map.get("latency", [])
            item_ttft_latency = item_map.get("time_to_first_token", [])
            item_rpm = item_map.get(precise_minute, {}).get("rpm", 0)
            item_tpm = item_map.get(precise_minute, {}).get("tpm", 0)

            # get average latency or average ttft (depending on streaming/non-streaming)
            total: float = 0.0
            if (
                request_kwargs is not None
                and request_kwargs.get("stream") is True
                and len(item_ttft_latency) > 0
            ):
                for _call_latency in item_ttft_latency:
                    if isinstance(_call_latency, float):
                        total += _call_latency
                item_latency = total / len(item_ttft_latency)
            else:
                for _call_latency in item_latency:
                    if isinstance(_call_latency, float):
                        total += _call_latency
                # guard against an empty latency list
                item_latency = total / len(item_latency) if item_latency else 0.0

            # -------------- #
            # Debugging Logic
            # -------------- #
            # We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
            # this helps a user debug why the router picked a specific deployment #
            _deployment_api_base = _deployment.get("litellm_params", {}).get(
                "api_base", ""
            )
            if _deployment_api_base is not None:
                _latency_per_deployment[_deployment_api_base] = item_latency
            # -------------- #
            # End of Debugging Logic
            # -------------- #

            if (
                item_tpm + input_tokens > _deployment_tpm
                or item_rpm + 1 > _deployment_rpm
            ):  # if the user passed in tpm / rpm in the model_list
                continue
            else:
                potential_deployments.append((_deployment, item_latency))

        if len(potential_deployments) == 0:
            return None

        # Sort potential deployments by latency
        sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])

        # Find lowest latency deployment
        lowest_latency = sorted_deployments[0][1]

        # Find deployments within buffer of lowest latency
        buffer = self.routing_args.lowest_latency_buffer * lowest_latency

        valid_deployments = [
            x for x in sorted_deployments if x[1] <= lowest_latency + buffer
        ]
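
        # Worked example (illustrative numbers): with lowest_latency_buffer=0.5
        # and the fastest deployment at 0.02s/token, every deployment at or
        # under 0.02 + (0.5 * 0.02) = 0.03s/token stays in valid_deployments,
        # and the random pick below spreads load across that "good enough" set.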

        # Pick a random deployment from valid deployments
        random_valid_deployment = random.choice(valid_deployments)
        deployment = random_valid_deployment[0]

        if request_kwargs is not None and "metadata" in request_kwargs:
            request_kwargs["metadata"][
                "_latency_per_deployment"
            ] = _latency_per_deployment
        return deployment
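

if __name__ == "__main__":
    # A minimal smoke-test sketch, assuming litellm is installed and that a
    # bare DualCache() defaults to in-memory caching (as it does in current
    # litellm). Deployment ids and the model group name are made up here.
    handler = LowestLatencyLoggingHandler(
        router_cache=DualCache(),
        model_list=[],
        routing_args={"ttl": 3600, "lowest_latency_buffer": 0.5},
    )
    fake_deployments = [
        {"model_info": {"id": "deployment-1"}, "litellm_params": {}},
        {"model_info": {"id": "deployment-2"}, "litellm_params": {}},
    ]
    # With no recorded latencies, both deployments are seeded with latency=[0],
    # so the pick is effectively random between them.
    picked = handler.get_available_deployments(
        model_group="gpt-3.5-turbo",
        healthy_deployments=fake_deployments,
    )
    print(picked)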