forked from phoenix/litellm-mirror

(fix) add some better load testing

parent 48b9250a3d
commit 311918b99c

6 changed files with 270 additions and 14 deletions
@@ -0,0 +1,20 @@
# Use the official Python image as the base image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the Python requirements file
COPY requirements.txt .

# Install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . .

# Expose the port the app will run on
EXPOSE 8090

# Start the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8090"]
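
A quick way to sanity-check the container, assuming it has been built and started with port 8090 published, is to point the OpenAI SDK at it. A minimal sketch (the endpoint ignores the request body, so the model name is arbitrary; the api_key is the placeholder value used throughout this commit):

# Sketch: smoke test against the container, assuming it is reachable on localhost:8090.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8090", api_key="sk-1234")

response = client.chat.completions.create(
    model="anything",  # the proxy returns a fixed response, so any name works
    messages=[{"role": "user", "content": "hello who are you"}],
)
print(response)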
59  litellm/proxy/proxy_load_test/litellm_router_proxy/main.py  Normal file
@@ -0,0 +1,59 @@
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
import uuid
import litellm

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

litellm_router = litellm.Router(
    model_list=[
        {
            "model_name": "anything",  # model alias -> loadbalance between models with same `model_name`
            "litellm_params": {  # params for litellm completion/embedding call
                "model": "openai/anything",  # actual model name
                "api_key": "sk-1234",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
            },
        }
    ]
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
    # this proxy uses a litellm.Router to call a fixed endpoint

    response = await litellm_router.acompletion(
        model="anything",
        messages=[
            {
                "role": "user",
                "content": "hello who are you",
            }
        ],
    )

    return response


if __name__ == "__main__":
    import uvicorn

    # run this on 8090, 8091, 8092 and 8093
    uvicorn.run(app, host="0.0.0.0", port=8090)
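
The closing comment says to run this app on ports 8090, 8091, 8092 and 8093. One possible way to do that from a single script, sketched here under the assumption that fork-based multiprocessing is available and that this file is importable as `main` (the launcher itself is not part of this commit):

# Hypothetical launcher: serve the same FastAPI app on ports 8090-8093.
import multiprocessing

import uvicorn

from main import app  # the FastAPI app defined above


def serve(port: int) -> None:
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == "__main__":
    processes = [
        multiprocessing.Process(target=serve, args=(port,)) for port in range(8090, 8094)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()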
54  litellm/proxy/proxy_load_test/simple_litellm_proxy.py  Normal file
@@ -0,0 +1,54 @@
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
import uuid
import litellm
import openai
from openai import AsyncOpenAI

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

litellm_client = AsyncOpenAI(
    base_url="https://exampleopenaiendpoint-production.up.railway.app/",
    api_key="sk-1234",
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
    # this proxy uses the OpenAI SDK to call a fixed endpoint

    response = await litellm.acompletion(
        model="openai/anything",
        messages=[
            {
                "role": "user",
                "content": "hello who are you",
            }
        ],
        client=litellm_client,
    )

    return response


if __name__ == "__main__":
    import uvicorn

    # run this on 8090, 8091, 8092 and 8093
    uvicorn.run(app, host="0.0.0.0", port=8090)
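
The distinctive part of this proxy is that a single AsyncOpenAI client is created once and handed to litellm.acompletion via client=, so the underlying connection pool is reused across requests. A standalone sketch of the same pattern, using the same placeholder endpoint and key as the file above:

# Sketch: reuse one AsyncOpenAI client across many litellm.acompletion calls.
import asyncio

import litellm
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="https://exampleopenaiendpoint-production.up.railway.app/",
    api_key="sk-1234",
)


async def one_call() -> None:
    response = await litellm.acompletion(
        model="openai/anything",
        messages=[{"role": "user", "content": "hello who are you"}],
        client=client,  # reuse the same HTTP connection pool
    )
    print(response)


if __name__ == "__main__":
    asyncio.run(one_call())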
59  litellm/proxy/proxy_load_test/simple_litellm_router_proxy.py  Normal file
@@ -0,0 +1,59 @@
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
import uuid
import litellm

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

litellm_router = litellm.Router(
    model_list=[
        {
            "model_name": "anything",  # model alias -> loadbalance between models with same `model_name`
            "litellm_params": {  # params for litellm completion/embedding call
                "model": "openai/anything",  # actual model name
                "api_key": "sk-1234",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
            },
        }
    ]
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
    # this proxy uses a litellm.Router to call a fixed endpoint

    response = await litellm_router.acompletion(
        model="anything",
        messages=[
            {
                "role": "user",
                "content": "hello who are you",
            }
        ],
    )

    return response


if __name__ == "__main__":
    import uvicorn

    # run this on 8090, 8091, 8092 and 8093
    uvicorn.run(app, host="0.0.0.0", port=8090)
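
The model_name alias is what the Router load-balances over, but with a single deployment there is nothing to balance. A sketch of how a second deployment could share the "anything" alias so the Router actually spreads traffic (the second api_base below is hypothetical, not part of this commit):

# Sketch only: two deployments sharing the "anything" alias so the Router
# round-robins between them. The second api_base is a made-up placeholder.
import litellm

litellm_router = litellm.Router(
    model_list=[
        {
            "model_name": "anything",
            "litellm_params": {
                "model": "openai/anything",
                "api_key": "sk-1234",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
            },
        },
        {
            "model_name": "anything",  # same alias -> same load-balancing pool
            "litellm_params": {
                "model": "openai/anything",
                "api_key": "sk-1234",
                "api_base": "https://example-second-endpoint.invalid/",  # hypothetical
            },
        },
    ]
)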
52  litellm/proxy/proxy_load_test/simple_proxy.py  Normal file
@@ -0,0 +1,52 @@
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
import uuid
import openai
from openai import AsyncOpenAI

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

litellm_client = AsyncOpenAI(
    base_url="https://exampleopenaiendpoint-production.up.railway.app/",
    api_key="sk-1234",
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
    # this proxy uses the OpenAI SDK to call a fixed endpoint

    response = await litellm_client.chat.completions.create(
        model="anything",
        messages=[
            {
                "role": "user",
                "content": "hello who are you",
            }
        ],
    )

    return response


if __name__ == "__main__":
    import uvicorn

    # run this on 8090, 8091, 8092 and 8093
    uvicorn.run(app, host="0.0.0.0", port=8090)
@@ -1,56 +1,68 @@
-import time, asyncio, os
+import time
+import asyncio
+import os
 from openai import AsyncOpenAI, AsyncAzureOpenAI
 import uuid
 import traceback
 from large_text import text
 from dotenv import load_dotenv
+from statistics import mean, median
 
-litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000/", api_key="sk-1234")
 
 
 async def litellm_completion():
-    # Your existing code for litellm_completion goes here
     try:
+        start_time = time.time()
         response = await litellm_client.chat.completions.create(
-            model="fake_openai",
+            model="fake-openai-endpoint",
             messages=[
                 {
                     "role": "user",
-                    "content": f"{text}. Who was alexander the great? {uuid.uuid4()}",
+                    "content": f"This is a test{uuid.uuid4()}",
                 }
             ],
             user="my-new-end-user-1",
         )
-        return response
+        end_time = time.time()
+        latency = end_time - start_time
+        print("response time=", latency)
+        return response, latency
 
     except Exception as e:
-        # If there's an exception, log the error message
         with open("error_log.txt", "a") as error_log:
             error_log.write(f"Error during completion: {str(e)}\n")
-        pass
+        return None, 0
 
 
 async def main():
-    for i in range(3):
+    latencies = []
+    for i in range(5):
         start = time.time()
-        n = 10  # Number of concurrent tasks
+        n = 100  # Number of concurrent tasks
         tasks = [litellm_completion() for _ in range(n)]
 
         chat_completions = await asyncio.gather(*tasks)
 
-        successful_completions = [c for c in chat_completions if c is not None]
+        successful_completions = [c for c, l in chat_completions if c is not None]
+        completion_latencies = [l for c, l in chat_completions if c is not None]
+        latencies.extend(completion_latencies)
 
-        # Write errors to error_log.txt
         with open("error_log.txt", "a") as error_log:
-            for completion in chat_completions:
+            for completion, latency in chat_completions:
                 if isinstance(completion, str):
                     error_log.write(completion + "\n")
 
         print(n, time.time() - start, len(successful_completions))
 
+
+    if latencies:
+        average_latency = mean(latencies)
+        median_latency = median(latencies)
+        print(f"Average Latency per Response: {average_latency} seconds")
+        print(f"Median Latency per Response: {median_latency} seconds")
 
 
 if __name__ == "__main__":
-    # Blank out contents of error_log.txt
     open("error_log.txt", "w").close()
 
     asyncio.run(main())
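
Mean and median hide tail behaviour; since the goal of this commit is better load testing, a possible follow-up (a sketch of mine, not part of this diff) is to also report p95/p99 over the same latencies list collected in main(), using statistics.quantiles:

# Sketch: tail-latency summary for the latencies list collected in main().
from statistics import mean, median, quantiles


def summarize(latencies: list[float]) -> None:
    if len(latencies) < 2:
        return  # quantiles() needs at least two samples
    cuts = quantiles(latencies, n=100)  # 99 cut points
    p95, p99 = cuts[94], cuts[98]
    print(f"Average Latency per Response: {mean(latencies)} seconds")
    print(f"Median Latency per Response: {median(latencies)} seconds")
    print(f"p95: {p95} seconds, p99: {p99} seconds")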