Merge pull request #3585 from BerriAI/litellm_router_batch_comp

[Litellm Proxy + litellm.Router] - Pass the same message/prompt to N models
Ishaan Jaff 2024-05-11 13:51:45 -07:00 committed by GitHub
commit bf909a89f8
7 changed files with 213 additions and 2 deletions


@ -4,6 +4,12 @@ LiteLLM allows you to:
* Send 1 completion call to many models: Return Fastest Response
* Send 1 completion call to many models: Return All Responses
:::info
Trying to do batch completion on LiteLLM Proxy? Go here: https://docs.litellm.ai/docs/proxy/user_keys#beta-batch-completions---pass-model-as-list
:::
## Send multiple completion calls to 1 model
In the batch_completion method, you provide a list of `messages` where each sub-list of messages is passed to `litellm.completion()`, allowing you to process multiple prompts efficiently in a single API call.
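A minimal sketch of what this looks like; the model name and the two prompts below are placeholders:

```python
import litellm

# each sub-list of messages becomes its own call to litellm.completion()
responses = litellm.batch_completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[
        [{"role": "user", "content": "good morning?"}],
        [{"role": "user", "content": "what is the weather in SF?"}],
    ],
    max_tokens=10,
)
print(responses)  # one response object per prompt, in the same order
```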


@ -365,6 +365,90 @@ curl --location 'http://0.0.0.0:4000/moderations' \
## Advanced
### (BETA) Batch Completions - pass `model` as List
Use this when you want to send the same request to N models.
#### Expected Request Format
This request will be sent to the following model groups defined on the [litellm proxy config.yaml](https://docs.litellm.ai/docs/proxy/configs):
- `model_name="llama3"`
- `model_name="gpt-3.5-turbo"`
```shell
curl --location 'http://localhost:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": ["llama3", "gpt-3.5-turbo"],
"max_tokens": 10,
"user": "litellm2",
"messages": [
{
"role": "user",
"content": "is litellm getting better"
}
]
}'
```
#### Expected Response Format
Get a list of responses when `model` is passed as a list
```json
[
  {
    "id": "chatcmpl-3dbd5dd8-7c82-4ca3-bf1f-7c26f497cf2b",
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "message": {
          "content": "The Elder Scrolls IV: Oblivion!\n\nReleased",
          "role": "assistant"
        }
      }
    ],
    "created": 1715459876,
    "model": "groq/llama3-8b-8192",
    "object": "chat.completion",
    "system_fingerprint": "fp_179b0f92c9",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 12,
      "total_tokens": 22
    }
  },
  {
    "id": "chatcmpl-9NnldUfFLmVquFHSX4yAtjCw8PGei",
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "message": {
          "content": "TES4 could refer to The Elder Scrolls IV:",
          "role": "assistant"
        }
      }
    ],
    "created": 1715459877,
    "model": "gpt-3.5-turbo-0125",
    "object": "chat.completion",
    "system_fingerprint": null,
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 9,
      "total_tokens": 19
    }
  }
]
```
### Pass User LLM API Keys, Fallbacks
Allow your end-users to pass their own model list, API base, and OpenAI API key (for any LiteLLM-supported provider) when making requests.
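A minimal sketch of what such a request could look like, assuming (as the proxy change further down suggests) that `api_key` is read directly from the request body; the proxy URL, proxy key, and end-user key below are placeholders:

```python
import openai

# placeholder proxy URL and proxy key
client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
    # assumption: the proxy picks up "api_key" from the request body and uses it for the upstream call
    extra_body={"api_key": "sk-my-end-users-openai-key"},
)
print(response)
```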


@ -4,6 +4,12 @@ model_list:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: llama3
    litellm_params:
      model: groq/llama3-8b-8192
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
  - model_name: "*"
    litellm_params:
      model: openai/*


@ -3656,7 +3656,7 @@ async def chat_completion(
### MODEL ALIAS MAPPING ###
# check if model name in model alias map
# get the actual model name
if data["model"] in litellm.model_alias_map:
if isinstance(data["model"], str) and data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]]
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
@ -3690,6 +3690,9 @@ async def chat_completion(
        # skip router if user passed their key
        if "api_key" in data:
            tasks.append(litellm.acompletion(**data))
        elif isinstance(data["model"], list) and llm_router is not None:
            _models = data.pop("model")
            tasks.append(llm_router.abatch_completion(models=_models, **data))
        elif "user_config" in data:
            # initialize a new router instance. make request using this Router
            router_config = data.pop("user_config")


@ -606,6 +606,33 @@ class Router:
            self.fail_calls[model_name] += 1
            raise e

    async def abatch_completion(
        self, models: List[str], messages: List[Dict[str, str]], **kwargs
    ):

        async def _async_completion_no_exceptions(
            model: str, messages: List[Dict[str, str]], **kwargs
        ):
            """
            Wrapper around self.acompletion that catches exceptions and returns them as a result
            """
            try:
                return await self.acompletion(model=model, messages=messages, **kwargs)
            except Exception as e:
                return e

        _tasks = []
        for model in models:
            # add each task; if the call for a model fails, the exception is returned as its result
            _tasks.append(
                _async_completion_no_exceptions(
                    model=model, messages=messages, **kwargs
                )
            )

        response = await asyncio.gather(*_tasks)
        return response

    def image_generation(self, prompt: str, model: str, **kwargs):
        try:
            kwargs["model"] = model

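Because `_async_completion_no_exceptions` returns exceptions instead of raising them, a caller can split the gathered results into successes and failures. A minimal sketch, with a router configuration mirroring the test below and placeholder model names:

```python
import asyncio
import litellm

router = litellm.Router(
    model_list=[
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}},
        {"model_name": "groq-llama", "litellm_params": {"model": "groq/llama3-8b-8192"}},
    ]
)

async def main():
    results = await router.abatch_completion(
        models=["gpt-3.5-turbo", "groq-llama"],
        messages=[{"role": "user", "content": "is litellm getting better"}],
        max_tokens=15,
    )
    # each entry is either a response or the exception raised for that model
    successes = [r for r in results if not isinstance(r, Exception)]
    failures = [r for r in results if isinstance(r, Exception)]
    print(f"{len(successes)} succeeded, {len(failures)} failed")

asyncio.run(main())
```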

@ -0,0 +1,60 @@
#### What this tests ####
# This tests litellm router with batch completion

import sys, os, time, openai
import traceback, asyncio
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import Router
from litellm.router import Deployment, LiteLLM_Params, ModelInfo
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import os, httpx

load_dotenv()


@pytest.mark.asyncio
async def test_batch_completion_multiple_models():
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    response = await router.abatch_completion(
        models=["gpt-3.5-turbo", "groq-llama"],
        messages=[
            {"role": "user", "content": "is litellm becoming a better product ?"}
        ],
        max_tokens=15,
    )

    print(response)
    assert len(response) == 2

    models_in_responses = []
    for individual_response in response:
        _model = individual_response["model"]
        models_in_responses.append(_model)

    # assert both models are different
    assert models_in_responses[0] != models_in_responses[1]


@ -4,6 +4,7 @@ import pytest
import asyncio
import aiohttp, openai
from openai import OpenAI, AsyncOpenAI
from typing import Optional, List, Union
def response_header_check(response):
@ -71,7 +72,7 @@ async def new_user(session):
return await response.json()
async def chat_completion(session, key, model="gpt-4"):
async def chat_completion(session, key, model: Union[str, List] = "gpt-4"):
url = "http://0.0.0.0:4000/chat/completions"
headers = {
"Authorization": f"Bearer {key}",
@ -409,3 +410,27 @@ async def test_openai_wildcard_chat_completion():
        # call chat/completions with a model that the key was not created for + the model is not on the config.yaml
        await chat_completion(session=session, key=key, model="gpt-3.5-turbo-0125")


@pytest.mark.asyncio
async def test_batch_chat_completions():
    """
    - Make a chat completion call passing `model` as a list of models
    """
    async with aiohttp.ClientSession() as session:

        # call chat/completions with a list of models
        response = await chat_completion(
            session=session,
            key="sk-1234",
            model=[
                "gpt-3.5-turbo",
                "fake-openai-endpoint",
            ],
        )

        print(f"response: {response}")

        assert len(response) == 2
        assert isinstance(response, list)