# #### What this tests ####
# # This tests the router's ability to identify the least busy deployment
# #
# # How is this achieved?
# # - Before each call, have the router print the state of requests {"deployment": "requests_in_flight"}
# # - use litellm.input_callbacks to log when a request is just about to be made to a model - {"deployment-id": traffic}
# # - use litellm.success + failure callbacks to log when a request has completed
# # - in get_available_deployment, for a given model group name -> pick based on traffic (see the sketch below)
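
# The comments above describe the least-busy mechanism in prose; below is a
# minimal, self-contained sketch of it. This is NOT litellm's internal
# implementation: `InFlightTracker` and `pick_least_busy` are hypothetical
# names. The callbacks maintain a {"deployment-id": traffic} map, and the
# selection step simply returns the deployment with the least traffic.

from collections import defaultdict


class InFlightTracker:
    """Tracks requests in flight per deployment, as the callbacks above would."""

    def __init__(self):
        self.in_flight = defaultdict(int)  # {"deployment-id": traffic}

    def on_request_start(self, deployment_id: str) -> None:
        # analogous to an input callback firing just before a request is made
        self.in_flight[deployment_id] += 1

    def on_request_end(self, deployment_id: str) -> None:
        # analogous to a success/failure callback firing on completion
        self.in_flight[deployment_id] -= 1


def pick_least_busy(in_flight: dict) -> str:
    """The selection step: return the deployment with the fewest in-flight requests."""
    return min(in_flight, key=in_flight.get)


# usage: "azure-europe" has no traffic, so it is picked
assert pick_least_busy({"azure-france": 3, "azure-europe": 0, "azure-canada": 1}) == "azure-europe"
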
# import sys, os, asyncio, time
# import traceback
# from dotenv import load_dotenv

# load_dotenv()

# sys.path.insert(
#     0, os.path.abspath("../..")
# )  # Adds the parent directory to the system path
# import pytest
# from litellm import Router
# import litellm

# async def test_least_busy_routing():
#     model_list = [{
#         "model_name": "azure-model",
#         "litellm_params": {
#             "model": "azure/gpt-turbo",
#             "api_key": "os.environ/AZURE_FRANCE_API_KEY",
#             "api_base": "https://openai-france-1234.openai.azure.com",
#             "rpm": 1440,
#         }
#     }, {
#         "model_name": "azure-model",
#         "litellm_params": {
#             "model": "azure/gpt-35-turbo",
#             "api_key": "os.environ/AZURE_EUROPE_API_KEY",
#             "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
#             "rpm": 6,
#         }
#     }, {
#         "model_name": "azure-model",
#         "litellm_params": {
#             "model": "azure/gpt-35-turbo",
#             "api_key": "os.environ/AZURE_CANADA_API_KEY",
#             "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
#             "rpm": 6,
#         }
#     }]
#     router = Router(
#         model_list=model_list,
#         routing_strategy="least-busy",
#         set_verbose=False,
#         num_retries=3,
#     )  # type: ignore

#     async def call_azure_completion():
#         try:
#             response = await router.acompletion(
#                 model="azure-model",
#                 messages=[
#                     {
#                         "role": "user",
#                         "content": "hello this request will pass",
#                     }
#                 ],
#             )
#             print("\n response", response)
#             return response
#         except Exception:
#             return None

#     n = 1000
#     start_time = time.time()
#     tasks = [call_azure_completion() for _ in range(n)]
#     chat_completions = await asyncio.gather(*tasks)
#     successful_completions = [c for c in chat_completions if c is not None]
#     print(n, time.time() - start_time, len(successful_completions))


# asyncio.run(test_least_busy_routing())
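
# Toy driver for the sketch above: dispatch a burst of simulated requests,
# always routing to the least busy deployment, and print the resulting
# in-flight counts. Purely illustrative; it never calls litellm, and the
# deployment names are stand-ins for the three Azure endpoints above.

if __name__ == "__main__":
    tracker = InFlightTracker()
    for deployment_id in ("azure-france", "azure-europe", "azure-canada"):
        tracker.in_flight[deployment_id] += 0  # register each deployment at 0
    for _ in range(9):
        # each simulated request goes to whichever deployment is least busy
        tracker.on_request_start(pick_least_busy(tracker.in_flight))
    print(dict(tracker.in_flight))  # expect an even spread: 3 requests each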