Merge pull request #3283 from BerriAI/litellm_debug_lowest_latency

[Fix] Add better observability for debugging lowest latency routing
Ishaan Jaff 2024-04-24 20:42:52 -07:00 committed by GitHub
commit 2c7f4695d9
4 changed files with 198 additions and 1 deletion


@@ -139,6 +139,28 @@ class SlackAlerting:
        except Exception as e:
            raise e

    def _get_deployment_latencies_to_alert(self, metadata=None):
        if metadata is None:
            return None
        if "_latency_per_deployment" in metadata:
            # Translate model_id to -> api_base
            # _latency_per_deployment is a dictionary that looks like this:
            """
            _latency_per_deployment: {
                api_base: 0.01336697916666667
            }
            """
            _message_to_send = ""
            _deployment_latencies = metadata["_latency_per_deployment"]
            if len(_deployment_latencies) == 0:
                return None
            for api_base, latency in _deployment_latencies.items():
                _message_to_send += f"\n{api_base}: {round(latency,2)}s"
            _message_to_send = "```" + _message_to_send + "```"
            return _message_to_send

    async def response_taking_too_long_callback(
        self,
        kwargs,  # kwargs to completion
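
For reference, a minimal standalone sketch of the formatting the new helper performs; the metadata payload below is made up, only its shape follows the docstring above:

# Hypothetical metadata payload; the dict shape mirrors the docstring above,
# the api_base values and latencies are made up.
metadata = {
    "_latency_per_deployment": {
        "https://endpoint-1.example.com/": 0.013,
        "https://endpoint-2.example.com/": 2.871,
    }
}

# Same formatting logic as _get_deployment_latencies_to_alert.
message = ""
for api_base, latency in metadata["_latency_per_deployment"].items():
    message += f"\n{api_base}: {round(latency, 2)}s"
message = "```" + message + "```"
print(message)
# Prints a Slack-style code block, one api_base per line with its latency in seconds.
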
@@ -165,6 +187,21 @@ class SlackAlerting:
                request_info = self._add_langfuse_trace_id_to_alert(
                    request_info=request_info, kwargs=kwargs
                )

            # add deployment latencies to alert
            if (
                kwargs is not None
                and "litellm_params" in kwargs
                and "metadata" in kwargs["litellm_params"]
            ):
                _metadata = kwargs["litellm_params"]["metadata"]
                _deployment_latency_map = self._get_deployment_latencies_to_alert(
                    metadata=_metadata
                )
                if _deployment_latency_map is not None:
                    request_info += (
                        f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
                    )
            await self.send_alert(
                message=slow_message + request_info,
                level="Low",
@@ -243,6 +280,14 @@ class SlackAlerting:
                alerting_message = (
                    f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
                )

                # add deployment latencies to alert
                _deployment_latency_map = self._get_deployment_latencies_to_alert(
                    metadata=request_data.get("metadata", {})
                )
                if _deployment_latency_map is not None:
                    request_info += f"\nDeployment Latencies\n{_deployment_latency_map}"

                await self.send_alert(
                    message=alerting_message + request_info,
                    level="Medium",


@@ -454,6 +454,7 @@ class Router:
                model=model,
                messages=messages,
                specific_deployment=kwargs.pop("specific_deployment", None),
                request_kwargs=kwargs,
            )

            # debug how often this deployment picked
@@ -2831,6 +2832,7 @@ class Router:
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        specific_deployment: Optional[bool] = False,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Async implementation of 'get_available_deployments'.
@@ -2846,6 +2848,7 @@ class Router:
                messages=messages,
                input=input,
                specific_deployment=specific_deployment,
                request_kwargs=request_kwargs,
            )

        model, healthy_deployments = self._common_checks_available_deployment(
@@ -2949,6 +2952,7 @@ class Router:
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        specific_deployment: Optional[bool] = False,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Returns the deployment based on routing strategy
@@ -3035,7 +3039,9 @@ class Router:
            and self.lowestlatency_logger is not None
        ):
            deployment = self.lowestlatency_logger.get_available_deployments(
                model_group=model,
                healthy_deployments=healthy_deployments,
                request_kwargs=request_kwargs,
            )
        elif (
            self.routing_strategy == "usage-based-routing"


@@ -11,6 +11,7 @@ from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm import ModelResponse
from litellm import token_counter
import litellm


class LiteLLMBase(BaseModel):
@@ -126,6 +127,61 @@ class LowestLatencyLoggingHandler(CustomLogger):
            traceback.print_exc()
            pass

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """
        Check if Timeout Error, if timeout set deployment latency -> 100
        """
        try:
            _exception = kwargs.get("exception", None)
            if isinstance(_exception, litellm.Timeout):
                if kwargs["litellm_params"].get("metadata") is None:
                    pass
                else:
                    model_group = kwargs["litellm_params"]["metadata"].get(
                        "model_group", None
                    )
                    id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
                    if model_group is None or id is None:
                        return
                    elif isinstance(id, int):
                        id = str(id)

                    # ------------
                    # Setup values
                    # ------------
                    """
                    {
                        {model_group}_map: {
                            id: {
                                "latency": [..]
                                f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
                            }
                        }
                    }
                    """
                    latency_key = f"{model_group}_map"
                    request_count_dict = (
                        self.router_cache.get_cache(key=latency_key) or {}
                    )

                    if id not in request_count_dict:
                        request_count_dict[id] = {}

                    ## Latency
                    request_count_dict[id].setdefault("latency", []).append(100.0)

                    self.router_cache.set_cache(
                        key=latency_key,
                        value=request_count_dict,
                        ttl=self.routing_args.ttl,
                    )  # reset map within window
            else:
                # do nothing if it's not a timeout error
                return
        except Exception as e:
            traceback.print_exc()
            pass

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        try:
            """
@@ -216,12 +272,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
        healthy_deployments: list,
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Returns a deployment with the lowest latency
        """
        # get list of potential deployments
        latency_key = f"{model_group}_map"
        _latency_per_deployment = {}

        request_count_dict = self.router_cache.get_cache(key=latency_key) or {}
@@ -298,4 +356,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
            elif item_latency < lowest_latency:
                lowest_latency = item_latency
                deployment = _deployment

            # _latency_per_deployment is used for debugging
            _deployment_api_base = _deployment.get("litellm_params", {}).get(
                "api_base", ""
            )
            _latency_per_deployment[_deployment_api_base] = item_latency
        if request_kwargs is not None and "metadata" in request_kwargs:
            request_kwargs["metadata"][
                "_latency_per_deployment"
            ] = _latency_per_deployment
        return deployment


@@ -477,3 +477,81 @@ async def test_router_completion_streaming():
# asyncio.run(test_router_completion_streaming())
@pytest.mark.asyncio
async def test_lowest_latency_routing_with_timeouts():
    """
    PROD Test:
    - Endpoint 1: triggers timeout errors (it takes 10+ seconds to respond)
    - Endpoint 2: responds in under 1s
    - Run 4 requests to collect latency data
    - Wait till the cache is filled with latency data
    - Run 10 more requests
    - All requests should have been routed to endpoint 2
    """
    import litellm

    litellm.set_verbose = True

    router = Router(
        model_list=[
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "openai/slow-endpoint",
                    "api_base": "https://exampleopenaiendpoint-production-c715.up.railway.app/",  # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)
                    "api_key": "fake-key",
                },
                "model_info": {"id": "slow-endpoint"},
            },
            {
                "model_name": "azure-model",
                "litellm_params": {
                    "model": "openai/fast-endpoint",
                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                    "api_key": "fake-key",
                },
                "model_info": {"id": "fast-endpoint"},
            },
        ],
        routing_strategy="latency-based-routing",
        set_verbose=True,
        debug_level="DEBUG",
        timeout=1,
    )  # type: ignore

    # make 4 requests
    for _ in range(4):
        try:
            response = await router.acompletion(
                model="azure-model", messages=[{"role": "user", "content": "hello"}]
            )
            print(response)
        except Exception as e:
            print("got exception", e)

    await asyncio.sleep(1)
    print("done sending initial requests to collect latency")
    """
    Note: for debugging
    - By this point: slow-endpoint should have timed out 3-4 times and should be heavily penalized :)
    - The next 10 requests should all be routed to the fast-endpoint
    """

    deployments = {}
    # make 10 requests
    for _ in range(10):
        response = await router.acompletion(
            model="azure-model", messages=[{"role": "user", "content": "hello"}]
        )
        print(response)

        _picked_model_id = response._hidden_params["model_id"]
        if _picked_model_id not in deployments:
            deployments[_picked_model_id] = 1
        else:
            deployments[_picked_model_id] += 1

    print("deployments", deployments)

    # ALL the Requests should have been routed to the fast-endpoint
    assert deployments["fast-endpoint"] == 10
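
For intuition, after the first batch of requests the cached latency map for the model group should look roughly like the sketch below (numbers made up, tpm/rpm buckets omitted); since lower recorded latency wins, every one of the next 10 picks should land on fast-endpoint:

# Rough picture of the cached latency map (cache key: "azure-model_map") after
# the first 4 requests; the numbers are made up and tpm/rpm buckets omitted.
latency_map = {
    "slow-endpoint": {"latency": [100.0, 100.0, 100.0, 100.0]},  # timeout penalties
    "fast-endpoint": {"latency": [0.42, 0.38]},
}

# Roughly, the deployment with the lower recorded latency wins the next picks.
for _id, stats in latency_map.items():
    print(_id, sum(stats["latency"]) / len(stats["latency"]))
# slow-endpoint 100.0
# fast-endpoint ~0.4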