From 9156b7448aef8932517af744ec3bb20c4edc5f82 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:08:16 -0700
Subject: [PATCH 1/7] feat - router async batch acompletion

---
 litellm/router.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/litellm/router.py b/litellm/router.py
index f0d94908e..7396dab20 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -605,6 +605,33 @@ class Router:
                 self.fail_calls[model_name] += 1
             raise e
 
+    async def abatch_completion(
+        self, models: List[str], messages: List[Dict[str, str]], **kwargs
+    ):
+
+        async def _async_completion_no_exceptions(
+            model: str, messages: List[Dict[str, str]], **kwargs
+        ):
+            """
+            Wrapper around self.acompletion that catches exceptions and returns them as a result
+            """
+            try:
+                return await self.acompletion(model=model, messages=messages, **kwargs)
+            except Exception as e:
+                return e
+
+        _tasks = []
+        for model in models:
+            # add one task per model; a failed call returns its exception instead of raising
+            _tasks.append(
+                _async_completion_no_exceptions(
+                    model=model, messages=messages, **kwargs
+                )
+            )
+
+        response = await asyncio.gather(*_tasks)
+        return response
+
     def image_generation(self, prompt: str, model: str, **kwargs):
         try:
             kwargs["model"] = model

From 6561e0838ea36695847d38f648b81c46d225ce08 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:09:17 -0700
Subject: [PATCH 2/7] test - router.batch_acompletion

---
 litellm/tests/test_router_batch_completion.py | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 litellm/tests/test_router_batch_completion.py

diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py
new file mode 100644
index 000000000..f2873b18d
--- /dev/null
+++ b/litellm/tests/test_router_batch_completion.py
@@ -0,0 +1,60 @@
+#### What this tests ####
+# This tests the litellm router with batch completion
+
+import sys, os, time, openai
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+from litellm import Router
+from litellm.router import Deployment, LiteLLM_Params, ModelInfo
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from dotenv import load_dotenv
+import os, httpx
+
+load_dotenv()
+
+
+@pytest.mark.asyncio
+async def test_batch_completion_multiple_models():
+    litellm.set_verbose = True
+
+    router = litellm.Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+            },
+            {
+                "model_name": "groq-llama",
+                "litellm_params": {
+                    "model": "groq/llama3-8b-8192",
+                },
+            },
+        ]
+    )
+
+    response = await router.abatch_completion(
+        models=["gpt-3.5-turbo", "groq-llama"],
+        messages=[
+            {"role": "user", "content": "is litellm becoming a better product ?"}
+        ],
+        max_tokens=15,
+    )
+
+    print(response)
+    assert len(response) == 2
+
+    models_in_responses = []
+    for individual_response in response:
+        _model = individual_response["model"]
+        models_in_responses.append(_model)
+
+    # assert the two responses come from different models
+    assert models_in_responses[0] != models_in_responses[1]
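A minimal usage sketch of the new `Router.abatch_completion` method, mirroring the test above. The model groups and message are the illustrative values from the test (provider API keys are assumed to be set in the environment); because failed calls are returned rather than raised, each entry in the result list is either a completion or an exception object:

```python
import asyncio

import litellm

router = litellm.Router(
    model_list=[
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}},
        {"model_name": "groq-llama", "litellm_params": {"model": "groq/llama3-8b-8192"}},
    ]
)


async def main():
    # fan one request out to both model groups; results come back in the same order as `models`
    responses = await router.abatch_completion(
        models=["gpt-3.5-turbo", "groq-llama"],
        messages=[{"role": "user", "content": "is litellm becoming a better product ?"}],
        max_tokens=15,
    )
    for model, resp in zip(["gpt-3.5-turbo", "groq-llama"], responses):
        # failed calls come back as exception objects instead of being raised
        if isinstance(resp, Exception):
            print(f"{model} failed: {resp}")
        else:
            print(f"{model}: {resp['choices'][0]['message']['content']}")


asyncio.run(main())
```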
From b8c7bbcb9f8d10103ebed09f9c73d6d49ca48024 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:24:25 -0700
Subject: [PATCH 3/7] support batch /chat/completions on proxy

---
 litellm/proxy/proxy_server.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 46c132773..0ae498baa 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3639,7 +3639,7 @@ async def chat_completion(
         ### MODEL ALIAS MAPPING ###
         # check if model name in model alias map
         # get the actual model name
-        if data["model"] in litellm.model_alias_map:
+        if isinstance(data["model"], str) and data["model"] in litellm.model_alias_map:
             data["model"] = litellm.model_alias_map[data["model"]]
 
         ## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
@@ -3673,6 +3673,9 @@ async def chat_completion(
         # skip router if user passed their key
         if "api_key" in data:
             tasks.append(litellm.acompletion(**data))
+        elif isinstance(data["model"], list) and llm_router is not None:
+            _models = data.pop("model")
+            tasks.append(llm_router.abatch_completion(models=_models, **data))
         elif "user_config" in data:
             # initialize a new router instance. make request using this Router
             router_config = data.pop("user_config")

From 31cb1be27929eec766447616942b5e7b70f2b61d Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:24:59 -0700
Subject: [PATCH 4/7] edit dev config.yaml

---
 litellm/proxy/proxy_config.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index b1cbf2e81..85634c5b8 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -4,6 +4,12 @@ model_list:
       model: openai/fake
       api_key: fake-key
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
+  - model_name: llama3
+    litellm_params:
+      model: groq/llama3-8b-8192
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
   - model_name: "*"
     litellm_params:
       model: openai/*

From e1f94fcbbb3a51107fa14ef09d69132eff92e124 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:32:30 -0700
Subject: [PATCH 5/7] test batch completions on litellm proxy

---
 tests/test_openai_endpoints.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/test_openai_endpoints.py b/tests/test_openai_endpoints.py
index 38e87c254..7bc97ca59 100644
--- a/tests/test_openai_endpoints.py
+++ b/tests/test_openai_endpoints.py
@@ -4,6 +4,7 @@ import pytest
 import asyncio
 import aiohttp, openai
 from openai import OpenAI, AsyncOpenAI
+from typing import Optional, List, Union
 
 
 def response_header_check(response):
@@ -71,7 +72,7 @@ async def new_user(session):
     return await response.json()
 
 
-async def chat_completion(session, key, model="gpt-4"):
+async def chat_completion(session, key, model: Union[str, List] = "gpt-4"):
     url = "http://0.0.0.0:4000/chat/completions"
     headers = {
         "Authorization": f"Bearer {key}",
@@ -409,3 +410,27 @@ async def test_openai_wildcard_chat_completion():
 
         # call chat/completions with a model that the key was not created for + the model is not on the config.yaml
         await chat_completion(session=session, key=key, model="gpt-3.5-turbo-0125")
+
+
+@pytest.mark.asyncio
+async def test_batch_chat_completions():
+    """
+    - Make a chat completion call with `model` passed as a list of models
+
+    """
+    async with aiohttp.ClientSession() as session:
+
+        # call chat/completions with a list of models; the proxy fans the request out to both
+        response = await chat_completion(
+            session=session,
+            key="sk-1234",
+            model=[
+                "gpt-3.5-turbo",
+                "fake-openai-endpoint",
+            ],
+        )
+
+        print(f"response: {response}")
+
+        assert len(response) == 2
+        assert isinstance(response, list)
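For context, a rough sketch of exercising the new proxy behavior from Python, in the spirit of the test above. The proxy URL, the `sk-1234` master key, and the model group names are the illustrative values used in the test and dev config; it assumes a proxy running locally and `aiohttp` installed:

```python
import asyncio

import aiohttp


async def batch_chat_completion():
    async with aiohttp.ClientSession() as session:
        # pass `model` as a list so the proxy fans the request out via Router.abatch_completion
        async with session.post(
            "http://0.0.0.0:4000/chat/completions",
            headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
            json={
                "model": ["gpt-3.5-turbo", "fake-openai-endpoint"],
                "max_tokens": 10,
                "messages": [{"role": "user", "content": "is litellm getting better"}],
            },
        ) as response:
            completions = await response.json()

    # the proxy returns one chat completion object per requested model group
    for completion in completions:
        print(completion["model"], completion["choices"][0]["message"]["content"])


asyncio.run(batch_chat_completion())
```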
From 5918ee543bd9d3523bfc4fdb6c6ceff4384f8a3e Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:42:41 -0700
Subject: [PATCH 6/7] docs - batch completion litellm proxy

---
 docs/my-website/docs/proxy/user_keys.md | 84 +++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md
index fa78b37c1..7aba832eb 100644
--- a/docs/my-website/docs/proxy/user_keys.md
+++ b/docs/my-website/docs/proxy/user_keys.md
@@ -365,6 +365,90 @@ curl --location 'http://0.0.0.0:4000/moderations' \
 
 ## Advanced
 
+### (BETA) Batch Completions - pass `model` as List
+
+Use this when you want to send one request to N models.
+
+#### Expected Request Format
+
+This request will be sent to both of the following model groups on the [litellm proxy config.yaml](https://docs.litellm.ai/docs/proxy/configs):
+- `model_name="llama3"`
+- `model_name="gpt-3.5-turbo"`
+
+```shell
+curl --location 'http://localhost:4000/chat/completions' \
+    --header 'Authorization: Bearer sk-1234' \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "model": ["llama3", "gpt-3.5-turbo"],
+    "max_tokens": 10,
+    "user": "litellm2",
+    "messages": [
+        {
+        "role": "user",
+        "content": "is litellm getting better"
+        }
+    ]
+}'
+```
+
+
+#### Expected Response Format
+
+Get a list of responses, one per model, when `model` is passed as a list
+
+```json
+[
+  {
+    "id": "chatcmpl-3dbd5dd8-7c82-4ca3-bf1f-7c26f497cf2b",
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "message": {
+          "content": "The Elder Scrolls IV: Oblivion!\n\nReleased",
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1715459876,
+    "model": "groq/llama3-8b-8192",
+    "object": "chat.completion",
+    "system_fingerprint": "fp_179b0f92c9",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 12,
+      "total_tokens": 22
+    }
+  },
+  {
+    "id": "chatcmpl-9NnldUfFLmVquFHSX4yAtjCw8PGei",
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "message": {
+          "content": "TES4 could refer to The Elder Scrolls IV:",
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1715459877,
+    "model": "gpt-3.5-turbo-0125",
+    "object": "chat.completion",
+    "system_fingerprint": null,
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 9,
+      "total_tokens": 19
+    }
+  }
+]
+```
+
+
+
+
 ### Pass User LLM API Keys, Fallbacks
 
 Allow your end-users to pass their model list, api base, OpenAI API key (any LiteLLM supported provider) to make requests
From 62276fc22102aafbc9ea91d0f978a39b252ce75a Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 11 May 2024 13:45:32 -0700
Subject: [PATCH 7/7] docs link to litellm batch completions

---
 docs/my-website/docs/completion/batching.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md
index 05683b3dd..09f59f743 100644
--- a/docs/my-website/docs/completion/batching.md
+++ b/docs/my-website/docs/completion/batching.md
@@ -4,6 +4,12 @@ LiteLLM allows you to:
 * Send 1 completion call to many models: Return Fastest Response
 * Send 1 completion call to many models: Return All Responses
 
+:::info
+
+Trying to do batch completion on the LiteLLM Proxy? Go here: https://docs.litellm.ai/docs/proxy/user_keys#beta-batch-completions---pass-model-as-list
+
+:::
+
 ## Send multiple completion calls to 1 model
 
 In the batch_completion method, you provide a list of `messages` where each sub-list of messages is passed to `litellm.completion()`, allowing you to process multiple prompts efficiently in a single API call.
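The context lines above refer to the SDK-level `litellm.batch_completion` (many prompts, one model), which is the counterpart to the proxy-level batching added in this series. A minimal sketch of that call, assuming the signature described in batching.md (a model string plus a list of message lists); the prompts are illustrative:

```python
import litellm

# each inner list is a separate conversation; every conversation goes to the same model
responses = litellm.batch_completion(
    model="gpt-3.5-turbo",
    messages=[
        [{"role": "user", "content": "good morning?"}],
        [{"role": "user", "content": "what's the weather in SF?"}],
    ],
    max_tokens=10,
)

# one response per conversation, in the same order as `messages`
for resp in responses:
    print(resp["choices"][0]["message"]["content"])
```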