From ecd182eb6aed5e59ca294c77c4b32ff1bcb9118f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 19:44:41 -0700 Subject: [PATCH 1/7] feat(router.py): support fastest response batch completion call returns fastest response. cancels others. --- litellm/router.py | 73 ++++++++++++++++++- litellm/tests/test_router_batch_completion.py | 46 ++++++++---- 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index e2ebea37f..631360da6 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -356,7 +356,8 @@ class Router: raise ValueError(f"Item '{fallback_dict}' is not a dictionary.") if len(fallback_dict) != 1: raise ValueError( - f"Dictionary '{fallback_dict}' must have exactly one key, but has {len(fallback_dict)} keys.") + f"Dictionary '{fallback_dict}' must have exactly one key, but has {len(fallback_dict)} keys." + ) def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict): if routing_strategy == "least-busy": @@ -737,6 +738,76 @@ class Router: response = await asyncio.gather(*_tasks) return response + # fmt: off + + @overload + async def abatch_completion_fastest_response( + self, models: List[str], messages: List[Dict[str, str]], stream: Literal[True], **kwargs + ) -> CustomStreamWrapper: + ... + + + + @overload + async def abatch_completion_fastest_response( + self, models: List[str], messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs + ) -> ModelResponse: + ... + + # fmt: on + + async def abatch_completion_fastest_response( + self, + models: List[str], + messages: List[Dict[str, str]], + stream: bool = False, + **kwargs, + ): + """Send 1 completion call to many models: Return Fastest Response.""" + + async def _async_completion_no_exceptions( + model: str, messages: List[Dict[str, str]], **kwargs + ): + """ + Wrapper around self.async_completion that catches exceptions and returns them as a result + """ + try: + return await self.acompletion(model=model, messages=messages, **kwargs) + except Exception as e: + return e + + _tasks = [] + pending_tasks = [] # type: ignore + + async def check_response(task): + nonlocal pending_tasks + result = await task + if isinstance(result, (ModelResponse, CustomStreamWrapper)): + # If a desired response is received, cancel all other pending tasks + for t in pending_tasks: + t.cancel() + return result + else: + try: + pending_tasks.remove(task) + except Exception as e: + pass + + for model in models: + task = asyncio.create_task( + _async_completion_no_exceptions( + model=model, messages=messages, **kwargs + ) + ) + task.add_done_callback(check_response) + _tasks.append(task) + pending_tasks.append(task) + + responses = await asyncio.gather(*_tasks, return_exceptions=True) + if isinstance(responses[0], Exception): + raise responses[0] + return responses[0] # return first value from list + def image_generation(self, prompt: str, model: str, **kwargs): try: kwargs["model"] = model diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index f2873b18d..219881dcb 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -19,8 +19,9 @@ import os, httpx load_dotenv() +@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"]) @pytest.mark.asyncio -async def test_batch_completion_multiple_models(): +async def test_batch_completion_multiple_models(mode): litellm.set_verbose = True router = litellm.Router( @@ -40,21 +41,34 @@ 
async def test_batch_completion_multiple_models(): ] ) - response = await router.abatch_completion( - models=["gpt-3.5-turbo", "groq-llama"], - messages=[ - {"role": "user", "content": "is litellm becoming a better product ?"} - ], - max_tokens=15, - ) + if mode == "all_responses": + response = await router.abatch_completion( + models=["gpt-3.5-turbo", "groq-llama"], + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + ) - print(response) - assert len(response) == 2 + print(response) + assert len(response) == 2 - models_in_responses = [] - for individual_response in response: - _model = individual_response["model"] - models_in_responses.append(_model) + models_in_responses = [] + for individual_response in response: + _model = individual_response["model"] + models_in_responses.append(_model) - # assert both models are different - assert models_in_responses[0] != models_in_responses[1] + # assert both models are different + assert models_in_responses[0] != models_in_responses[1] + elif mode == "fastest_response": + from openai.types.chat.chat_completion import ChatCompletion + + response = await router.abatch_completion_fastest_response( + models=["gpt-3.5-turbo", "groq-llama"], + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + ) + + ChatCompletion.model_validate(response.model_dump(), strict=True) From 20106715d59c73487644117ffbccf41ae35aa691 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 20:09:31 -0700 Subject: [PATCH 2/7] feat(proxy_server.py): enable batch completion fastest response calls on proxy introduces new `fastest_response` flag for enabling the call --- litellm/main.py | 1 + litellm/proxy/proxy_server.py | 23 ++++++++++++++++++++++- litellm/router.py | 11 +++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 5da2b4a52..cb197aef8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -680,6 +680,7 @@ def completion( "region_name", "allowed_model_region", "model_config", + "fastest_response", ] default_params = openai_params + litellm_params diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6efcb2a70..ee1cd7a64 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -415,6 +415,7 @@ def get_custom_headers( api_base: Optional[str] = None, version: Optional[str] = None, model_region: Optional[str] = None, + fastest_response_batch_completion: Optional[bool] = None, ) -> dict: exclude_values = {"", None} headers = { @@ -425,6 +426,11 @@ def get_custom_headers( "x-litellm-model-region": model_region, "x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit), "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit), + "x-litellm-fastest_response_batch_completion": ( + str(fastest_response_batch_completion) + if fastest_response_batch_completion is not None + else None + ), } try: return { @@ -4035,7 +4041,17 @@ async def chat_completion( elif "," in data["model"] and llm_router is not None: _models_csv_string = data.pop("model") _models = _models_csv_string.split(",") - tasks.append(llm_router.abatch_completion(models=_models, **data)) + if ( + data.get("fastest_response", None) is not None + and data["fastest_response"] == True + ): + tasks.append( + llm_router.abatch_completion_fastest_response( + models=_models, **data + ) + ) + else: + tasks.append(llm_router.abatch_completion(models=_models, **data)) elif "user_config" in 
data: # initialize a new router instance. make request using this Router router_config = data.pop("user_config") @@ -4085,6 +4101,9 @@ async def chat_completion( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + fastest_response_batch_completion = hidden_params.get( + "fastest_response_batch_completion", None + ) # Post Call Processing if llm_router is not None: @@ -4101,6 +4120,7 @@ async def chat_completion( api_base=api_base, version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), + fastest_response_batch_completion=fastest_response_batch_completion, ) selected_data_generator = select_data_generator( response=response, @@ -4121,6 +4141,7 @@ async def chat_completion( api_base=api_base, version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), + fastest_response_batch_completion=fastest_response_batch_completion, ) ) diff --git a/litellm/router.py b/litellm/router.py index 631360da6..b87d0dded 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -804,9 +804,16 @@ class Router: pending_tasks.append(task) responses = await asyncio.gather(*_tasks, return_exceptions=True) - if isinstance(responses[0], Exception): + if isinstance(responses[0], Exception) or isinstance( + responses[0], BaseException + ): raise responses[0] - return responses[0] # return first value from list + _response: Union[ModelResponse, CustomStreamWrapper] = responses[ + 0 + ] # return first value from list + + _response._hidden_params["fastest_response_batch_completion"] = True + return _response def image_generation(self, prompt: str, model: str, **kwargs): try: From 1ebae6e7b0f0f736ace30ff056612f06b82f6348 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:34:37 -0700 Subject: [PATCH 3/7] fix(router.py): support comma-separated model list for batch completion fastest response --- litellm/proxy/_super_secret_config.yaml | 2 +- litellm/proxy/proxy_server.py | 10 +-- litellm/router.py | 77 +++++++++++-------- litellm/tests/test_router_batch_completion.py | 44 ++++++++++- 4 files changed, 94 insertions(+), 39 deletions(-) diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index ca108e631..f0a7ba827 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -36,7 +36,7 @@ model_list: api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ api_key: os.environ/AZURE_EUROPE_API_KEY model: azure/gpt-35-turbo - model_name: gpt-3.5-turbo + model_name: gpt-3.5-turbo-fake-model - litellm_params: api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ api_key: os.environ/AZURE_API_KEY diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ee1cd7a64..083452089 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4039,18 +4039,14 @@ async def chat_completion( if "api_key" in data: tasks.append(litellm.acompletion(**data)) elif "," in data["model"] and llm_router is not None: - _models_csv_string = data.pop("model") - _models = _models_csv_string.split(",") if ( data.get("fastest_response", None) is not None and data["fastest_response"] == True ): - tasks.append( - llm_router.abatch_completion_fastest_response( - models=_models, **data - ) - ) + tasks.append(llm_router.abatch_completion_fastest_response(**data)) else: + _models_csv_string = data.pop("model") + _models = [model.strip() for 
model in _models_csv_string.split(",")] tasks.append(llm_router.abatch_completion(models=_models, **data)) elif "user_config" in data: # initialize a new router instance. make request using this Router diff --git a/litellm/router.py b/litellm/router.py index b87d0dded..1ed6854cd 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -742,7 +742,7 @@ class Router: @overload async def abatch_completion_fastest_response( - self, models: List[str], messages: List[Dict[str, str]], stream: Literal[True], **kwargs + self, model: str, messages: List[Dict[str, str]], stream: Literal[True], **kwargs ) -> CustomStreamWrapper: ... @@ -750,7 +750,7 @@ class Router: @overload async def abatch_completion_fastest_response( - self, models: List[str], messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs + self, model: str, messages: List[Dict[str, str]], stream: Literal[False] = False, **kwargs ) -> ModelResponse: ... @@ -758,39 +758,56 @@ class Router: async def abatch_completion_fastest_response( self, - models: List[str], + model: str, messages: List[Dict[str, str]], stream: bool = False, **kwargs, ): - """Send 1 completion call to many models: Return Fastest Response.""" + """ + model - List of comma-separated model names. E.g. model="gpt-4, gpt-3.5-turbo" + + Returns fastest response from list of model names. OpenAI-compatible endpoint. + """ + models = [m.strip() for m in model.split(",")] async def _async_completion_no_exceptions( - model: str, messages: List[Dict[str, str]], **kwargs - ): + model: str, messages: List[Dict[str, str]], **kwargs: Any + ) -> Union[ModelResponse, CustomStreamWrapper, Exception]: """ - Wrapper around self.async_completion that catches exceptions and returns them as a result + Wrapper around self.acompletion that catches exceptions and returns them as a result """ try: return await self.acompletion(model=model, messages=messages, **kwargs) + except asyncio.CancelledError: + verbose_router_logger.debug( + "Received 'task.cancel'. Cancelling call w/ model={}.".format(model) + ) + raise except Exception as e: return e - _tasks = [] pending_tasks = [] # type: ignore - async def check_response(task): + async def check_response(task: asyncio.Task): nonlocal pending_tasks - result = await task - if isinstance(result, (ModelResponse, CustomStreamWrapper)): - # If a desired response is received, cancel all other pending tasks - for t in pending_tasks: - t.cancel() - return result - else: + try: + result = await task + if isinstance(result, (ModelResponse, CustomStreamWrapper)): + verbose_router_logger.debug( + "Received successful response. Cancelling other LLM API calls." 
+ ) + # If a desired response is received, cancel all other pending tasks + for t in pending_tasks: + t.cancel() + return result + except Exception: + # Ignore exceptions, let the loop handle them + pass + finally: + # Remove the task from pending tasks if it finishes try: pending_tasks.remove(task) - except Exception as e: + except KeyError: pass for model in models: @@ -799,21 +816,22 @@ class Router: model=model, messages=messages, **kwargs ) ) - task.add_done_callback(check_response) - _tasks.append(task) pending_tasks.append(task) - responses = await asyncio.gather(*_tasks, return_exceptions=True) - if isinstance(responses[0], Exception) or isinstance( - responses[0], BaseException - ): - raise responses[0] - _response: Union[ModelResponse, CustomStreamWrapper] = responses[ - 0 - ] # return first value from list + # Await the first task to complete successfully + while pending_tasks: + done, pending_tasks = await asyncio.wait( # type: ignore + pending_tasks, return_when=asyncio.FIRST_COMPLETED + ) + for completed_task in done: + result = await check_response(completed_task) + if result is not None: + # Return the first successful result + result._hidden_params["fastest_response_batch_completion"] = True + return result - _response._hidden_params["fastest_response_batch_completion"] = True - return _response + # If we exit the loop without returning, all tasks failed + raise Exception("All tasks failed") def image_generation(self, prompt: str, model: str, **kwargs): try: @@ -3624,7 +3642,6 @@ class Router: ## get healthy deployments ### get all deployments healthy_deployments = [m for m in self.model_list if m["model_name"] == model] - if len(healthy_deployments) == 0: # check if the user sent in a deployment name instead healthy_deployments = [ diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index 219881dcb..c74892814 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -64,7 +64,7 @@ async def test_batch_completion_multiple_models(mode): from openai.types.chat.chat_completion import ChatCompletion response = await router.abatch_completion_fastest_response( - models=["gpt-3.5-turbo", "groq-llama"], + model="gpt-3.5-turbo, groq-llama", messages=[ {"role": "user", "content": "is litellm becoming a better product ?"} ], @@ -72,3 +72,45 @@ async def test_batch_completion_multiple_models(mode): ) ChatCompletion.model_validate(response.model_dump(), strict=True) + + +@pytest.mark.asyncio +async def test_batch_completion_fastest_response_unit_test(): + """ + Unit test to confirm fastest response will always return the response which arrives earliest. 
+ + 2 models -> 1 is cached, the other is a real llm api call => assert cached response always returned + """ + litellm.set_verbose = True + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4", + }, + "model_info": {"id": "1"}, + }, + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "gpt-3.5-turbo", + "mock_response": "This is a fake response", + }, + "model_info": {"id": "2"}, + }, + ] + ) + + response = await router.abatch_completion_fastest_response( + model="gpt-4, gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=500, + ) + + assert response._hidden_params["model_id"] == "2" + assert response.choices[0].message.content == "This is a fake response" + print(f"response: {response}") From f168e356293ea195d1f88bf6c70ec05545cd89c5 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:39:09 -0700 Subject: [PATCH 4/7] build(config.yml): add pillow to ci/cd --- .circleci/config.yml | 1 + litellm/main.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 516f2b20d..27f79ed51 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -61,6 +61,7 @@ jobs: pip install prometheus-client==0.20.0 pip install "pydantic==2.7.1" pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" - save_cache: paths: - ./venv diff --git a/litellm/main.py b/litellm/main.py index cb197aef8..a7fbbfa69 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -14,7 +14,6 @@ from functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx - import litellm from ._logging import verbose_logger from litellm import ( # type: ignore From e3000504f9f61eeae75f2e91804ace9b061647ee Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 21:51:09 -0700 Subject: [PATCH 5/7] fix(router.py): support batch completions fastest response streaming --- litellm/router.py | 6 ++-- litellm/tests/test_router_batch_completion.py | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index 1ed6854cd..3715ec26c 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -771,13 +771,13 @@ class Router: models = [m.strip() for m in model.split(",")] async def _async_completion_no_exceptions( - model: str, messages: List[Dict[str, str]], **kwargs: Any + model: str, messages: List[Dict[str, str]], stream: bool, **kwargs: Any ) -> Union[ModelResponse, CustomStreamWrapper, Exception]: """ Wrapper around self.acompletion that catches exceptions and returns them as a result """ try: - return await self.acompletion(model=model, messages=messages, **kwargs) + return await self.acompletion(model=model, messages=messages, stream=stream, **kwargs) # type: ignore except asyncio.CancelledError: verbose_router_logger.debug( "Received 'task.cancel'. 
Cancelling call w/ model={}.".format(model) @@ -813,7 +813,7 @@ class Router: for model in models: task = asyncio.create_task( _async_completion_no_exceptions( - model=model, messages=messages, **kwargs + model=model, messages=messages, stream=stream, **kwargs ) ) pending_tasks.append(task) diff --git a/litellm/tests/test_router_batch_completion.py b/litellm/tests/test_router_batch_completion.py index c74892814..82fe102e2 100644 --- a/litellm/tests/test_router_batch_completion.py +++ b/litellm/tests/test_router_batch_completion.py @@ -114,3 +114,39 @@ async def test_batch_completion_fastest_response_unit_test(): assert response._hidden_params["model_id"] == "2" assert response.choices[0].message.content == "This is a fake response" print(f"response: {response}") + + +@pytest.mark.asyncio +async def test_batch_completion_fastest_response_streaming(): + litellm.set_verbose = True + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "gpt-3.5-turbo", + }, + }, + { + "model_name": "groq-llama", + "litellm_params": { + "model": "groq/llama3-8b-8192", + }, + }, + ] + ) + + from openai.types.chat.chat_completion_chunk import ChatCompletionChunk + + response = await router.abatch_completion_fastest_response( + model="gpt-3.5-turbo, groq-llama", + messages=[ + {"role": "user", "content": "is litellm becoming a better product ?"} + ], + max_tokens=15, + stream=True, + ) + + async for chunk in response: + ChatCompletionChunk.model_validate(chunk.model_dump(), strict=True) From 2ee599b848302ad71325985c842bc30c319bfca8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 22:08:06 -0700 Subject: [PATCH 6/7] docs(batching.md): add batch completion to docs --- docs/my-website/docs/completion/batching.md | 71 ++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md index 09f59f743..313356b7e 100644 --- a/docs/my-website/docs/completion/batching.md +++ b/docs/my-website/docs/completion/batching.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Batching Completion() LiteLLM allows you to: * Send many completion calls to 1 model @@ -51,6 +54,9 @@ This makes parallel calls to the specified `models` and returns the first respon Use this to reduce latency + + + ### Example Code ```python import litellm @@ -68,8 +74,70 @@ response = batch_completion_models( print(result) ``` + + + + + +[how to setup proxy config](../proxy/configs.md) + +Just pass a comma-separated string of model names and the flag `fastest_response=True`. + + + + +```bash + +curl -X POST 'http://localhost:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-D '{ + "model": "gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + "messages": [ + { + "role": "user", + "content": "What'\''s the weather like in Boston today?" 
+ } + ], + "stream": true, + "fastest_response": true # 👈 FLAG +} + +' +``` + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create( + model="gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={"fastest_response": true} # 👈 FLAG +) + +print(response) +``` + + + + + + ### Output -Returns the first response +Returns the first response in OpenAI format. Cancels other LLM API calls. ```json { "object": "chat.completion", @@ -95,6 +163,7 @@ Returns the first response } ``` + ## Send 1 completion call to many models: Return All Responses This makes parallel calls to the specified models and returns all responses From 3db30ecb4c18ca9b6434cf62abd41d461ac3c535 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 28 May 2024 22:14:22 -0700 Subject: [PATCH 7/7] docs(batching.md): add batch completion fastest response on proxy to docs --- docs/my-website/docs/completion/batching.md | 31 ++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md index 313356b7e..5854f4db8 100644 --- a/docs/my-website/docs/completion/batching.md +++ b/docs/my-website/docs/completion/batching.md @@ -79,7 +79,7 @@ print(result) -[how to setup proxy config](../proxy/configs.md) +[how to setup proxy config](#example-setup) Just pass a comma-separated string of model names and the flag `fastest_response=True`. @@ -92,11 +92,11 @@ curl -X POST 'http://localhost:4000/chat/completions' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ -D '{ - "model": "gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + "model": "gpt-4o, groq-llama", # 👈 Comma-separated models "messages": [ { "role": "user", - "content": "What'\''s the weather like in Boston today?" + "content": "What's the weather like in Boston today?" } ], "stream": true, @@ -118,7 +118,7 @@ client = openai.OpenAI( # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create( - model="gpt-3.5-turbo, bedrock-anthropic-claude-3", # 👈 Comma-separated models + model="gpt-4o, groq-llama", # 👈 Comma-separated models messages = [ { "role": "user", @@ -133,6 +133,29 @@ print(response) + +--- + +### Example Setup: + +```yaml +model_list: +- model_name: groq-llama + litellm_params: + model: groq/llama3-8b-8192 + api_key: os.environ/GROQ_API_KEY +- model_name: gpt-4o + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` +