# litellm-mirror/litellm/tests/test_profiling_router.py
#### What this tests ####
# This profiles a router call to find where calls are taking the most time.
import sys, os, time, logging
import traceback, asyncio, uuid
import pytest
import cProfile
from pstats import Stats
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv
from aiodebug import log_slow_callbacks # Import the aiodebug utility for logging slow callbacks
load_dotenv()
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%I:%M:%S %p',
    filename='aiologs.log',  # Name of the log file where logs will be written
    filemode='w'  # 'w' to overwrite the log file on each run, use 'a' to append
)

model_list = [{
    "model_name": "azure-model",
    "litellm_params": {
        "model": "azure/gpt-turbo",
        "api_key": "os.environ/AZURE_FRANCE_API_KEY",
        "api_base": "https://openai-france-1234.openai.azure.com",
        "rpm": 1440,
    }
}, {
    "model_name": "azure-model",
    "litellm_params": {
        "model": "azure/gpt-35-turbo",
        "api_key": "os.environ/AZURE_EUROPE_API_KEY",
        "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
        "rpm": 6,
    }
}, {
    "model_name": "azure-model",
    "litellm_params": {
        "model": "azure/gpt-35-turbo",
        "api_key": "os.environ/AZURE_CANADA_API_KEY",
        "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
        "rpm": 6,
    }
}]
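# litellm treats api_key values of the form "os.environ/<VAR>" as references to
# environment variables, so no real keys are hard-coded in this model list.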
router = Router(model_list=model_list, set_verbose=False, num_retries=3)
async def router_completion():
    try:
        messages = [{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}]
        response = await router.acompletion(model="azure-model", messages=messages)
        return response
    except asyncio.exceptions.CancelledError:
        print("Task was cancelled")
        return None
    except Exception:
        return None

async def loadtest_fn(n=1000):
    start = time.time()
    tasks = [router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

# loop = asyncio.get_event_loop()
# loop.set_debug(True)
# log_slow_callbacks.enable(0.05)  # Log callbacks slower than 0.05 seconds
# # Execute the load testing function within the asyncio event loop
# loop.run_until_complete(loadtest_fn())
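
# The cProfile/pstats imports at the top exist for the profiling this file's header
# describes. A minimal, commented-out sketch of one way to wire them up (assumes the
# loadtest_fn defined above; uncomment to profile a single load-test run instead of
# the sustained load test below):
# profiler = cProfile.Profile()
# profiler.enable()
# asyncio.run(loadtest_fn())
# profiler.disable()
# Stats(profiler).sort_stats("cumulative").print_stats(25)  # top 25 calls by cumulative time
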
### SUSTAINED LOAD TESTS ###
async def make_requests(n):
    tasks = [router_completion() for _ in range(n)]
    print(f"num tasks: {len(tasks)}")
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(f"successful_completions: {len(successful_completions)}")
    return successful_completions

async def main():
    total_successful_requests = 0
    request_limit = 1000
    batches = 2  # batches of 1k requests
    start = time.time()
    tasks = []  # list to hold all tasks

    async def request_loop():
        nonlocal tasks
        for _ in range(batches):
            # Make 1,000 requests
            task = asyncio.create_task(make_requests(request_limit))
            tasks.append(task)
            # Introduce a delay to achieve 1,000 requests per second
            await asyncio.sleep(1)

    await request_loop()
    results = await asyncio.gather(*tasks)
    total_successful_requests = sum(len(res) for res in results)
    print(request_limit * batches, time.time() - start, total_successful_requests)
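
# main() prints: <total requests sent> <elapsed seconds> <total successful completions>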
asyncio.run(main())