From ecd182eb6aed5e59ca294c77c4b32ff1bcb9118f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 19:44:41 -0700 Subject: [PATCH 1/7] feat(router.py): support fastest response batch completion call returns fastest response. cancels others. --- litellm/router.py | 73 ++++++++++++++++++- litellm/tests/test_router_batch_completion.py | 46 ++++++++---- 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index e2ebea37f..631360da6 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -356,7 +356,8 @@ class Router: raise ValueError(f"Item '{fallback_dict}' is not a dictionary.") if len(fallback_dict) != 1: raise ValueError( - f"Dictionary '{fallback_dict}' must have exactly one key, but has {len(fallback_dict)} keys.") + f"Dictionary '{fallback_dict}' must have exactly one key, but has {len(fallback_dict)} keys." + ) def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict): if routing_strategy == "least-busy": @@ -737,6 +738,76 @@ class Router: response = await asyncio.gather(*_tasks) return response + # fmt: off + + @overload + async def abatch_completion_fastest_response( + self, models: List[str], messages: List[Dict[str, str]], stream: Literal[True], **kwargs + ) -> CustomStreamWrapper: + ... + + + + @overload + async def abatch_completion_fastest_response( + self, models: List[str], messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs + ) -> ModelResponse: + ... + + # fmt: on + + async def abatch_completion_fastest_response( + self, + models: List[str], + messages: List[Dict[str, str]], + stream: bool = False, + **kwargs, + ): + """Send 1 completion call to many models: Return Fastest Response.""" + + async def _async_completion_no_exceptions( + model: str, messages: List[Dict[str, str]], **kwargs + ): + """ + Wrapper around self.async_completion that catches exceptions and returns them as a result + """ + try: + return await self.acompletion(model=model, messages=messages, **kwargs) + except Exception as e: + return e + + _tasks = [] + pending_tasks = [] # type: ignore + + async def check_response(task): + nonlocal pending_tasks + result = await task + if isinstance(result, (ModelResponse, CustomStreamWrapper)): + # If a desired response is received, cancel all other pending tasks + for t in pending_tasks: + t.cancel() + return result + else: + try: + pending_tasks.remove(task) + except Exception as e: + pass + + for model in models: + task = asyncio.create_task( + _async_completion_no_exceptions( + model=model, messages=messages, **kwargs + ) + ) + task.add_done_callback(check_response) + _tasks.append(task) + pending_tasks.append(task) + + responses = await asyncio.gather(*_tasks, return_exceptions=True) + if isinstance(responses[0], Exception): + raise responses[0] + return responses[0] # return first value from list + def image_generation(self, prompt: str, model: str, **kwargs): try: kwargs["model"] = model diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index f2873b18d..219881dcb 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -19,8 +19,9 @@ import os, httpx load_dotenv() +@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"]) @pytest.mark.asyncio -async def test_batch_completion_multiple_models(): +async def test_batch_completion_multiple_models(mode): litellm.set_verbose = True router = litellm.Router( @@ -40,21 +41,34 @@ 
async def test_batch_completion_multiple_models(): ] ) - response = await router.abatch_completion( - models=["gpt-3.5-turbo", "groq-llama"], - messages=[ - {"role": "user", "content": "is litellm becoming a better product ?"} - ], - max_tokens=15, - ) + if mode == "all_responses": + response = await router.abatch_completion( + models=["gpt-3.5-turbo", "groq-llama"], + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + ) - print(response) - assert len(response) == 2 + print(response) + assert len(response) == 2 - models_in_responses = [] - for individual_response in response: - _model = individual_response["model"] - models_in_responses.append(_model) + models_in_responses = [] + for individual_response in response: + _model = individual_response["model"] + models_in_responses.append(_model) - # assert both models are different - assert models_in_responses[0] != models_in_responses[1] + # assert both models are different + assert models_in_responses[0] != models_in_responses[1] + elif mode == "fastest_response": + from openai.types.chat.chat_completion import ChatCompletion + + response = await router.abatch_completion_fastest_response( + models=["gpt-3.5-turbo", "groq-llama"], + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + ) + + ChatCompletion.model_validate(response.model_dump(), strict=True) From 20106715d59c73487644117ffbccf41ae35aa691 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 20:09:31 -0700 Subject: [PATCH 2/7] feat(proxy_server.py): enable batch completion fastest response calls on proxy introduces new `fastest_response` flag for enabling the call --- litellm/main.py | 1 + litellm/proxy/proxy_server.py | 23 ++++++++++++++++++++++- litellm/router.py | 11 +++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 5da2b4a52..cb197aef8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -680,6 +680,7 @@ def completion( "region_name", "allowed_model_region", "model_config", + "fastest_response", ] default_params = openai_params + litellm_params diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6efcb2a70..ee1cd7a64 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -415,6 +415,7 @@ def get_custom_headers( api_base: Optional[str] = None, version: Optional[str] = None, model_region: Optional[str] = None, + fastest_response_batch_completion: Optional[bool] = None, ) -> dict: exclude_values = {"", None} headers = { @@ -425,6 +426,11 @@ def get_custom_headers( "x-litellm-model-region": model_region, "x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit), "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit), + "x-litellm-fastest_response_batch_completion": ( + str(fastest_response_batch_completion) + if fastest_response_batch_completion is not None + else None + ), } try: return { @@ -4035,7 +4041,17 @@ async def chat_completion( elif "," in data["model"] and llm_router is not None: _models_csv_string = data.pop("model") _models = _models_csv_string.split(",") - tasks.append(llm_router.abatch_completion(models=_models, **data)) + if ( + data.get("fastest_response", None) is not None + and data["fastest_response"] == True + ): + tasks.append( + llm_router.abatch_completion_fastest_response( + models=_models, **data + ) + ) + else: + tasks.append(llm_router.abatch_completion(models=_models, **data)) elif "user_config" in 
data: # initialize a new router instance. make request using this Router router_config = data.pop("user_config") @@ -4085,6 +4101,9 @@ async def chat_completion( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + fastest_response_batch_completion = hidden_params.get( + "fastest_response_batch_completion", None + ) # Post Call Processing if llm_router is not None: @@ -4101,6 +4120,7 @@ async def chat_completion( api_base=api_base, version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), + fastest_response_batch_completion=fastest_response_batch_completion, ) selected_data_generator = select_data_generator( response=response, @@ -4121,6 +4141,7 @@ async def chat_completion( api_base=api_base, version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), + fastest_response_batch_completion=fastest_response_batch_completion, ) ) diff --git a/litellm/router.py b/litellm/router.py index 631360da6..b87d0dded 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -804,9 +804,16 @@ class Router: pending_tasks.append(task) responses = await asyncio.gather(*_tasks, return_exceptions=True) - if isinstance(responses[0], Exception): + if isinstance(responses[0], Exception) or isinstance( + responses[0], BaseException + ): raise responses[0] - return responses[0] # return first value from list + _response: Union[ModelResponse, CustomStreamWrapper] = responses[ + 0 + ] # return first value from list + + _response._hidden_params["fastest_response_batch_completion"] = True + return _response def image_generation(self, prompt: str, model: str, **kwargs): try: From 1ebae6e7b0f0f736ace30ff056612f06b82f6348 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:34:37 -0700 Subject: [PATCH 3/7] fix(router.py): support comma-separated model list for batch completion fastest response --- litellm/proxy/_super_secret_config.yaml | 2 +- litellm/proxy/proxy_server.py | 10 +-- litellm/router.py | 77 +++++++++++-------- litellm/tests/test_router_batch_completion.py | 44 ++++++++++- 4 files changed, 94 insertions(+), 39 deletions(-) diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index ca108e631..f0a7ba827 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -36,7 +36,7 @@ model_list: api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ api_key: os.environ/AZURE_EUROPE_API_KEY model: azure/gpt-35-turbo - model_name: gpt-3.5-turbo + model_name: gpt-3.5-turbo-fake-model - litellm_params: api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ api_key: os.environ/AZURE_API_KEY diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ee1cd7a64..083452089 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4039,18 +4039,14 @@ async def chat_completion( if "api_key" in data: tasks.append(litellm.acompletion(**data)) elif "," in data["model"] and llm_router is not None: - _models_csv_string = data.pop("model") - _models = _models_csv_string.split(",") if ( data.get("fastest_response", None) is not None and data["fastest_response"] == True ): - tasks.append( - llm_router.abatch_completion_fastest_response( - models=_models, **data - ) - ) + tasks.append(llm_router.abatch_completion_fastest_response(**data)) else: + _models_csv_string = data.pop("model") + _models = [model.strip() for 
model in _models_csv_string.split(",")] tasks.append(llm_router.abatch_completion(models=_models, **data)) elif "user_config" in data: # initialize a new router instance. make request using this Router diff --git a/litellm/router.py b/litellm/router.py index b87d0dded..1ed6854cd 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -742,7 +742,7 @@ class Router: @overload async def abatch_completion_fastest_response( - self, models: List[str], messages: List[Dict[str, str]], stream: Literal[True], **kwargs + self, model: str, messages: List[Dict[str, str]], stream: Literal[True], **kwargs ) -> CustomStreamWrapper: ... @@ -750,7 +750,7 @@ class Router: @overload async def abatch_completion_fastest_response( - self, models: List[str], messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs + self, model: str, messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs ) -> ModelResponse: ... @@ -758,39 +758,56 @@ class Router: async def abatch_completion_fastest_response( self, - models: List[str], + model: str, messages: List[Dict[str, str]], stream: bool = False, **kwargs, ): - """Send 1 completion call to many models: Return Fastest Response.""" + """ + model - List of comma-separated model names. E.g. model="gpt-4, gpt-3.5-turbo" + + Returns fastest response from list of model names. OpenAI-compatible endpoint. + """ + models = [m.strip() for m in model.split(",")] async def _async_completion_no_exceptions( - model: str, messages: List[Dict[str, str]], **kwargs - ): + model: str, messages: List[Dict[str, str]], **kwargs: Any + ) -> Union[ModelResponse, CustomStreamWrapper, Exception]: """ - Wrapper around self.async_completion that catches exceptions and returns them as a result + Wrapper around self.acompletion that catches exceptions and returns them as a result """ try: return await self.acompletion(model=model, messages=messages, **kwargs) + except asyncio.CancelledError: + verbose_router_logger.debug( + "Received 'task.cancel'. Cancelling call w/ model={}.".format(model) + ) + raise except Exception as e: return e - _tasks = [] pending_tasks = [] # type: ignore - async def check_response(task): + async def check_response(task: asyncio.Task): nonlocal pending_tasks - result = await task - if isinstance(result, (ModelResponse, CustomStreamWrapper)): - # If a desired response is received, cancel all other pending tasks - for t in pending_tasks: - t.cancel() - return result - else: + try: + result = await task + if isinstance(result, (ModelResponse, CustomStreamWrapper)): + verbose_router_logger.debug( + "Received successful response. Cancelling other LLM API calls." 
+ ) + # If a desired response is received, cancel all other pending tasks + for t in pending_tasks: + t.cancel() + return result + except Exception: + # Ignore exceptions, let the loop handle them + pass + finally: + # Remove the task from pending tasks if it finishes try: pending_tasks.remove(task) - except Exception as e: + except KeyError: pass for model in models: @@ -799,21 +816,22 @@ class Router: model=model, messages=messages, **kwargs ) ) - task.add_done_callback(check_response) - _tasks.append(task) pending_tasks.append(task) - responses = await asyncio.gather(*_tasks, return_exceptions=True) - if isinstance(responses[0], Exception) or isinstance( - responses[0], BaseException - ): - raise responses[0] - _response: Union[ModelResponse, CustomStreamWrapper] = responses[ - 0 - ] # return first value from list + # Await the first task to complete successfully + while pending_tasks: + done, pending_tasks = await asyncio.wait( # type: ignore + pending_tasks, return_when=asyncio.FIRST_COMPLETED + ) + for completed_task in done: + result = await check_response(completed_task) + if result is not None: + # Return the first successful result + result._hidden_params["fastest_response_batch_completion"] = True + return result - _response._hidden_params["fastest_response_batch_completion"] = True - return _response + # If we exit the loop without returning, all tasks failed + raise Exception("All tasks failed") def image_generation(self, prompt: str, model: str, **kwargs): try: @@ -3624,7 +3642,6 @@ class Router: ## get healthy deployments ### get all deployments healthy_deployments = [m for m in self.model_list if m["model_name"] == model] - if len(healthy_deployments) == 0: # check if the user sent in a deployment name instead healthy_deployments = [ diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index 219881dcb..c74892814 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -64,7 +64,7 @@ async def test_batch_completion_multiple_models(mode): from openai.types.chat.chat_completion import ChatCompletion response = await router.abatch_completion_fastest_response( - models=["gpt-3.5-turbo", "groq-llama"], + model="gpt-3.5-turbo, groq-llama", messages=[ {"role": "user", "content": "is litellm becoming a better product ?"} ], @@ -72,3 +72,45 @@ async def test_batch_completion_multiple_models(mode): ) ChatCompletion.model_validate(response.model_dump(), strict=True) + + +@pytest.mark.asyncio +async def test_batch_completion_fastest_response_unit_test(): + """ + Unit test to confirm fastest response will always return the response which arrives earliest. 
+ + 2 models -> 1 is cached, the other is a real llm api call => assert cached response always returned + """ + litellm.set_verbose = True + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4", + }, + "model_info": {"id": "1"}, + }, + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "gpt-3.5-turbo", + "mock_response": "This is a fake response", + }, + "model_info": {"id": "2"}, + }, + ] + ) + + response = await router.abatch_completion_fastest_response( + model="gpt-4, gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=500, + ) + + assert response._hidden_params["model_id"] == "2" + assert response.choices[0].message.content == "This is a fake response" + print(f"response: {response}") From f168e356293ea195d1f88bf6c70ec05545cd89c5 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:39:09 -0700 Subject: [PATCH 4/7] build(config.yml): add pillow to ci/cd --- .circleci/config.yml | 1 + litellm/main.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 516f2b20d..27f79ed51 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -61,6 +61,7 @@ jobs: pip install prometheus-client==0.20.0 pip install "pydantic==2.7.1" pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" - save_cache: paths: - ./venv diff --git a/litellm/main.py b/litellm/main.py index cb197aef8..a7fbbfa69 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -14,7 +14,6 @@ from functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx - import litellm from ._logging import verbose_logger from litellm import ( # type: ignore From e3000504f9f61eeae75f2e91804ace9b061647ee Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:51:09 -0700 Subject: [PATCH 5/7] fix(router.py): support batch completions fastest response streaming --- litellm/router.py | 6 ++-- litellm/tests/test_router_batch_completion.py | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index 1ed6854cd..3715ec26c 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -771,13 +771,13 @@ class Router: models = [m.strip() for m in model.split(",")] async def _async_completion_no_exceptions( - model: str, messages: List[Dict[str, str]], **kwargs: Any + model: str, messages: List[Dict[str, str]], stream: bool, **kwargs: Any ) -> Union[ModelResponse, CustomStreamWrapper, Exception]: """ Wrapper around self.acompletion that catches exceptions and returns them as a result """ try: - return await self.acompletion(model=model, messages=messages, **kwargs) + return await self.acompletion(model=model, messages=messages, stream=stream, **kwargs) # type: ignore except asyncio.CancelledError: verbose_router_logger.debug( "Received 'task.cancel'. 
Cancelling call w/ model={}.".format(model) @@ -813,7 +813,7 @@ class Router: for model in models: task = asyncio.create_task( _async_completion_no_exceptions( - model=model, messages=messages, **kwargs + model=model, messages=messages, stream=stream, **kwargs ) ) pending_tasks.append(task) diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index c74892814..82fe102e2 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -114,3 +114,39 @@ async def test_batch_completion_fastest_response_unit_test(): assert response._hidden_params["model_id"] == "2" assert response.choices[0].message.content == "This is a fake response" print(f"response: {response}") + + +@pytest.mark.asyncio +async def test_batch_completion_fastest_response_streaming(): + litellm.set_verbose = True + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "gpt-3.5-turbo", + }, + }, + { + "model_name": "groq-llama", + "litellm_params": { + "model": "groq/llama3-8b-8192", + }, + }, + ] + ) + + from openai.types.chat.chat_completion_chunk import ChatCompletionChunk + + response = await router.abatch_completion_fastest_response( + model="gpt-3.5-turbo, groq-llama", + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + stream=True, + ) + + async for chunk in response: + ChatCompletionChunk.model_validate(chunk.model_dump(), strict=True) From 2ee599b848302ad71325985c842bc30c319bfca8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 22:08:06 -0700 Subject: [PATCH 6/7] docs(batching.md): add batch completion to docs --- docs/my-website/docs/completion/batching.md | 71 ++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md index 09f59f743..313356b7e 100644 --- a/docs/my-website/docs/completion/batching.md +++ b/docs/my-website/docs/completion/batching.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Batching Completion() LiteLLM allows you to: * Send many completion calls to 1 model @@ -51,6 +54,9 @@ This makes parallel calls to the specified `models` and returns the first respon Use this to reduce latency + + + ### Example Code ```python import litellm @@ -68,8 +74,70 @@ response = batch_completion_models( print(result) ``` + + + + + +[how to setup proxy config](../proxy/configs.md) + +Just pass a comma-separated string of model names and the flag `fastest_response=True`. + + + + +```bash + +curl -X POST 'http://localhost:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-D '{ + "model": "gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + "messages": [ + { + "role": "user", + "content": "What'\''s the weather like in Boston today?" 
+ } + ], + "stream": true, + "fastest_response": true # 👈 FLAG +} + +' +``` + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create( + model="gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={"fastest_response": true} # 👈 FLAG +) + +print(response) +``` + + + + + + ### Output -Returns the first response +Returns the first response in OpenAI format. Cancels other LLM API calls. ```json { "object": "chat.completion", @@ -95,6 +163,7 @@ Returns the first response } ``` + ## Send 1 completion call to many models: Return All Responses This makes parallel calls to the specified models and returns all responses From 3db30ecb4c18ca9b6434cf62abd41d461ac3c535 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 22:14:22 -0700 Subject: [PATCH 7/7] docs(batching.md): add batch completion fastest response on proxy to docs --- docs/my-website/docs/completion/batching.md | 31 ++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md index 313356b7e..5854f4db8 100644 --- a/docs/my-website/docs/completion/batching.md +++ b/docs/my-website/docs/completion/batching.md @@ -79,7 +79,7 @@ print(result) -[how to setup proxy config](../proxy/configs.md) +[how to setup proxy config](#example-setup) Just pass a comma-separated string of model names and the flag `fastest_response=True`. @@ -92,11 +92,11 @@ curl -X POST 'http://localhost:4000/chat/completions' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ -D '{ - "model": "gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + "model": "gpt-4o, groq-llama", # 👈 Comma-separated models "messages": [ { "role": "user", - "content": "What'\''s the weather like in Boston today?" + "content": "What's the weather like in Boston today?" } ], "stream": true, @@ -118,7 +118,7 @@ client = openai.OpenAI( # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create( - model="gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + model="gpt-4o, groq-llama", # 👈 Comma-separated models messages = [ { "role": "user", @@ -133,6 +133,29 @@ print(response) + +--- + +### Example Setup: + +```yaml +model_list: +- model_name: groq-llama + litellm_params: + model: groq/llama3-8b-8192 + api_key: os.environ/GROQ_API_KEY +- model_name: gpt-4o + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` +