#### What this tests ####
# This tests the litellm Router with batch completion:
# one request fanned out to multiple model deployments.

import os
import sys

import pytest
from dotenv import load_dotenv

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path, so the local litellm is imported

import litellm
from litellm import Router

load_dotenv()


@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"])
@pytest.mark.asyncio
async def test_batch_completion_multiple_models(mode):
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    if mode == "all_responses":
        # abatch_completion sends the same request to every listed model
        # and returns one response per model.
        response = await router.abatch_completion(
            models=["gpt-3.5-turbo", "groq-llama"],
            messages=[
                {"role": "user", "content": "is litellm becoming a better product?"}
            ],
            max_tokens=15,
        )

        print(response)
        assert len(response) == 2

        models_in_responses = []
        for individual_response in response:
            _model = individual_response["model"]
            models_in_responses.append(_model)

        # assert both responses came from different models
        assert models_in_responses[0] != models_in_responses[1]
    elif mode == "fastest_response":
        from openai.types.chat.chat_completion import ChatCompletion

        # abatch_completion_fastest_response races the listed models and
        # returns only the first response to arrive.
        response = await router.abatch_completion_fastest_response(
            model="gpt-3.5-turbo, groq-llama",
            messages=[
                {"role": "user", "content": "is litellm becoming a better product?"}
            ],
            max_tokens=15,
        )

        # the winning response must still be a valid OpenAI ChatCompletion
        ChatCompletion.model_validate(response.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_unit_test():
    """
    Unit test to confirm fastest_response always returns the response that arrives first.

    2 models -> one is mocked (returns instantly), the other is a real LLM API call
    => assert the mocked response is always the one returned.
    """
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {"id": "1"},
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    # mock_response short-circuits the API call, so this
                    # deployment always responds first
                    "mock_response": "This is a fake response",
                },
                "model_info": {"id": "2"},
            },
        ]
    )

    response = await router.abatch_completion_fastest_response(
        model="gpt-4, gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product?"}
        ],
        max_tokens=500,
    )

    # the mocked deployment (id "2") should always win the race
    assert response._hidden_params["model_id"] == "2"
    assert response.choices[0].message.content == "This is a fake response"
    print(f"response: {response}")