fix(router.py): support comma-separated model list for batch completion fastest response

2024-05-28 21:34:37 -07:00 · 2024-05-28 21:34:37 -07:00 · 1ebae6e7b0
commit 1ebae6e7b0
parent 20106715d5
4 changed files with 94 additions and 39 deletions
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -36,7 +36,7 @@ model_list:
    api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
    api_key: os.environ/AZURE_EUROPE_API_KEY
    model: azure/gpt-35-turbo
-  model_name: gpt-3.5-turbo
+  model_name: gpt-3.5-turbo-fake-model
 - litellm_params:
    api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
    api_key: os.environ/AZURE_API_KEY
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -4039,18 +4039,14 @@ async def chat_completion(
        if "api_key" in data:
            tasks.append(litellm.acompletion(**data))
        elif "," in data["model"] and llm_router is not None:
-            _models_csv_string = data.pop("model")
-            _models = _models_csv_string.split(",")
            if (
                data.get("fastest_response", None) is not None
                and data["fastest_response"] == True
            ):
-                tasks.append(
-                    llm_router.abatch_completion_fastest_response(
-                        models=_models, **data
-                    )
-                )
+                tasks.append(llm_router.abatch_completion_fastest_response(**data))
            else:
+                _models_csv_string = data.pop("model")
+                _models = [model.strip() for model in _models_csv_string.split(",")]
                tasks.append(llm_router.abatch_completion(models=_models, **data))
        elif "user_config" in data:
            # initialize a new router instance. make request using this Router
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -742,7 +742,7 @@ class Router:

    @overload
    async def abatch_completion_fastest_response(
-        self, models: List[str], messages: List[Dict[str, str]], stream: Literal[True], **kwargs
+        self, model: str, messages: List[Dict[str, str]], stream: Literal[True], **kwargs
    ) -> CustomStreamWrapper:
        ...

@@ -750,7 +750,7 @@ class Router:

    @overload
    async def abatch_completion_fastest_response(
-        self, models: List[str], messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs
+        self, model: str, messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs
    ) -> ModelResponse:
        ...

@@ -758,39 +758,56 @@ class Router:

    async def abatch_completion_fastest_response(
        self,
-        models: List[str],
+        model: str,
        messages: List[Dict[str, str]],
        stream: bool = False,
        **kwargs,
    ):
-        """Send 1 completion call to many models: Return Fastest Response."""
+        """
+        model - List of comma-separated model names. E.g. model="gpt-4, gpt-3.5-turbo"
+
+        Returns fastest response from list of model names. OpenAI-compatible endpoint.
+        """
+        models = [m.strip() for m in model.split(",")]

        async def _async_completion_no_exceptions(
-            model: str, messages: List[Dict[str, str]], **kwargs
-        ):
+            model: str, messages: List[Dict[str, str]], **kwargs: Any
+        ) -> Union[ModelResponse, CustomStreamWrapper, Exception]:
            """
-            Wrapper around self.async_completion that catches exceptions and returns them as a result
+            Wrapper around self.acompletion that catches exceptions and returns them as a result
            """
            try:
                return await self.acompletion(model=model, messages=messages, **kwargs)
+            except asyncio.CancelledError:
+                verbose_router_logger.debug(
+                    "Received 'task.cancel'. Cancelling call w/ model={}.".format(model)
+                )
+                raise
            except Exception as e:
                return e

-        _tasks = []
        pending_tasks = []  # type: ignore

-        async def check_response(task):
+        async def check_response(task: asyncio.Task):
            nonlocal pending_tasks
+            try:
                result = await task
                if isinstance(result, (ModelResponse, CustomStreamWrapper)):
+                    verbose_router_logger.debug(
+                        "Received successful response. Cancelling other LLM API calls."
+                    )
                    # If a desired response is received, cancel all other pending tasks
                    for t in pending_tasks:
                        t.cancel()
                    return result
-            else:
+            except Exception:
+                # Ignore exceptions, let the loop handle them
+                pass
+            finally:
+                # Remove the task from pending tasks if it finishes
                try:
                    pending_tasks.remove(task)
-                except Exception as e:
+                except KeyError:
                    pass

        for model in models:
@@ -799,21 +816,22 @@ class Router:
                    model=model, messages=messages, **kwargs
                )
            )
-            task.add_done_callback(check_response)
-            _tasks.append(task)
            pending_tasks.append(task)

-        responses = await asyncio.gather(*_tasks, return_exceptions=True)
-        if isinstance(responses[0], Exception) or isinstance(
-            responses[0], BaseException
-        ):
-            raise responses[0]
-        _response: Union[ModelResponse, CustomStreamWrapper] = responses[
-            0
-        ]  # return first value from list
+        # Await the first task to complete successfully
+        while pending_tasks:
+            done, pending_tasks = await asyncio.wait(  # type: ignore
+                pending_tasks, return_when=asyncio.FIRST_COMPLETED
+            )
+            for completed_task in done:
+                result = await check_response(completed_task)
+                if result is not None:
+                    # Return the first successful result
+                    result._hidden_params["fastest_response_batch_completion"] = True
+                    return result

-        _response._hidden_params["fastest_response_batch_completion"] = True
-        return _response
+        # If we exit the loop without returning, all tasks failed
+        raise Exception("All tasks failed")

    def image_generation(self, prompt: str, model: str, **kwargs):
        try:
@@ -3624,7 +3642,6 @@ class Router:
        ## get healthy deployments
        ### get all deployments
        healthy_deployments = [m for m in self.model_list if m["model_name"] == model]
-
        if len(healthy_deployments) == 0:
            # check if the user sent in a deployment name instead
            healthy_deployments = [
--- a/litellm/tests/test_router_batch_completion.py
+++ b/litellm/tests/test_router_batch_completion.py
@@ -64,7 +64,7 @@ async def test_batch_completion_multiple_models(mode):
        from openai.types.chat.chat_completion import ChatCompletion

        response = await router.abatch_completion_fastest_response(
-            models=["gpt-3.5-turbo", "groq-llama"],
+            model="gpt-3.5-turbo, groq-llama",
            messages=[
                {"role": "user", "content": "is litellm becoming a better product ?"}
            ],
@@ -72,3 +72,45 @@ async def test_batch_completion_multiple_models(mode):
        )

        ChatCompletion.model_validate(response.model_dump(), strict=True)
+
+
+@pytest.mark.asyncio
+async def test_batch_completion_fastest_response_unit_test():
+    """
+    Unit test to confirm fastest response will always return the response which arrives earliest.
+
+    2 models -> 1 is cached, the other is a real llm api call => assert cached response always returned
+    """
+    litellm.set_verbose = True
+
+    router = litellm.Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4",
+                },
+                "model_info": {"id": "1"},
+            },
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "mock_response": "This is a fake response",
+                },
+                "model_info": {"id": "2"},
+            },
+        ]
+    )
+
+    response = await router.abatch_completion_fastest_response(
+        model="gpt-4, gpt-3.5-turbo",
+        messages=[
+            {"role": "user", "content": "is litellm becoming a better product ?"}
+        ],
+        max_tokens=500,
+    )
+
+    assert response._hidden_params["model_id"] == "2"
+    assert response.choices[0].message.content == "This is a fake response"
+    print(f"response: {response}")