Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 18:54:30 +00:00
(docs) add 1k rps load test doc (#6059)
* docs 1k rps load test
* docs load testing
* docs load testing litellm
* docs load testing
* clean up load test doc
* docs prom metrics for load testing
* docs using prometheus on load testing
* doc load testing with prometheus
This commit is contained in:
parent
224460d4c9
commit
2449d258cf
9 changed files with 657 additions and 512 deletions
@@ -1,8 +1,8 @@
 import Image from '@theme/IdealImage';

-# Load Test LiteLLM
+# LiteLLM Proxy - Locust Load Test

-## How to run a locust load test on LiteLLM Proxy
+## Locust Load Test LiteLLM Proxy

 1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy

    litellm provides a free hosted `fake-openai-endpoint` you can load test against

@@ -50,512 +50,3 @@ model_list:

<Image img={require('../img/litellm_load_test.png')} />

## Load Test LiteLLM Proxy - 1500+ req/s

## 1500+ concurrent requests/s

LiteLLM proxy has been load tested to handle 1500+ concurrent req/s

```python
import time, asyncio
from openai import AsyncOpenAI, AsyncAzureOpenAI
import uuid
import traceback

# base_url - litellm proxy endpoint
# api_key - litellm proxy api-key, created when the proxy is run with auth
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


async def litellm_completion():
    # Your existing code for litellm_completion goes here
    try:
        response = await litellm_client.chat.completions.create(
            model="azure-gpt-3.5",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        print(response)
        return response

    except Exception as e:
        # If there's an exception, log the error message
        with open("error_log.txt", "a") as error_log:
            error_log.write(f"Error during completion: {str(e)}\n")
        pass


async def main():
    for i in range(1):
        start = time.time()
        n = 1500  # Number of concurrent tasks
        tasks = [litellm_completion() for _ in range(n)]

        chat_completions = await asyncio.gather(*tasks)

        successful_completions = [c for c in chat_completions if c is not None]

        # Write errors to error_log.txt
        with open("error_log.txt", "a") as error_log:
            for completion in chat_completions:
                if isinstance(completion, str):
                    error_log.write(completion + "\n")

        print(n, time.time() - start, len(successful_completions))
        time.sleep(10)


if __name__ == "__main__":
    # Blank out contents of error_log.txt
    open("error_log.txt", "w").close()

    asyncio.run(main())
```

### Throughput - 30% Increase

LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API

<Image img={require('../img/throughput.png')} />

### Latency Added - 0.00325 seconds

LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API

<Image img={require('../img/latency.png')} />

### Testing LiteLLM Proxy with Locust

- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures

<Image img={require('../img/locust.png')} />

## Load Test LiteLLM SDK vs OpenAI

Here is a script to load test LiteLLM vs OpenAI

```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)

#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4000"
)

#### AZURE OPENAI CLIENT ####
client = AsyncAzureOpenAI(
    api_key="my-api-key",           # [CHANGE THIS]
    azure_endpoint="my-api-base",   # [CHANGE THIS]
    api_version="2023-07-01-preview"
)


#### LITELLM ROUTER ####
model_list = [
    {
        "model_name": "azure-canada",
        "litellm_params": {
            "model": "azure/my-azure-deployment-name",  # [CHANGE THIS]
            "api_key": "my-api-key",                    # [CHANGE THIS]
            "api_base": "my-api-base",                  # [CHANGE THIS]
            "api_version": "2023-07-01-preview"
        }
    }
]

router = litellm.Router(model_list=model_list)

async def openai_completion():
    try:
        response = await client.chat.completions.create(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        return response
    except Exception as e:
        print(e)
        return None


async def router_completion():
    try:
        response = await router.acompletion(
            model="azure-canada",  # [CHANGE THIS]
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        return response
    except Exception as e:
        print(e)
        return None

async def proxy_completion_non_streaming():
    try:
        response = await litellm_client.chat.completions.create(
            model="sagemaker-models",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 500  # Number of concurrent tasks
    tasks = [proxy_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

# Run the event loop to execute the async function
asyncio.run(loadtest_fn())
```

## Multi-Instance TPM/RPM Load Test (Router)

Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### Code

Let's hit the router with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`

```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio

# Model list for OpenAI and Anthropic models
model_list = [
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8080",
            "rpm": 100
        },
    },
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8081",
            "rpm": 100
        },
    },
]

router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))


async def router_completion_non_streaming():
    try:
        client: Router = random.sample([router_1, router_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.acompletion(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [router_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```

## Multi-Instance TPM/RPM Load Test (Proxy)

Test if your defined tpm/rpm limits are respected across multiple instances.

The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute

So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### 1. Setup config

```yaml
model_list:
- litellm_params:
    api_base: http://0.0.0.0:8080
    api_key: my-fake-key
    model: openai/my-fake-model
    rpm: 100
  model_name: fake-openai-endpoint
- litellm_params:
    api_base: http://0.0.0.0:8081
    api_key: my-fake-key
    model: openai/my-fake-model-2
    rpm: 100
  model_name: fake-openai-endpoint
router_settings:
  num_retries: 0
  enable_pre_call_checks: true
  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
  routing_strategy: usage-based-routing-v2
```

### 2. Start 2 proxy instances

**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000

## RUNNING on http://0.0.0.0:4000
```

**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001

## RUNNING on http://0.0.0.0:4001
```

### 3. Run Test

Let's hit the proxy with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`

```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)

#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4001"
)

async def proxy_completion_non_streaming():
    try:
        client = random.sample([litellm_client, litellm_client_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.chat.completions.create(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [proxy_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```


### Extra - Setup Fake OpenAI Server

Let's setup a fake openai server with an RPM limit of 100.

Let's call our file `fake_openai_server.py`.

```python
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse


class ProxyException(Exception):
    # NOTE: DO NOT MODIFY THIS
    # This is used to map exactly to OPENAI Exceptions
    def __init__(
        self,
        message: str,
        type: str,
        param: Optional[str],
        code: Optional[int],
    ):
        self.message = message
        self.type = type
        self.param = param
        self.code = code

    def to_dict(self) -> dict:
        """Converts the ProxyException instance to a dictionary."""
        return {
            "message": self.message,
            "type": self.type,
            "param": self.param,
            "code": self.code,
        }


limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter

@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    return JSONResponse(status_code=429,
                        content={"detail": "Rate Limited!"})

app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
    # raise HTTPException(status_code=429, detail="Rate Limited!")
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": None,
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "\n\nHello there, how may I assist you today?",
            },
            "logprobs": None,
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": 9,
            "completion_tokens": 12,
            "total_tokens": 21
        }
    }

if __name__ == "__main__":
    import socket
    import uvicorn
    port = 8080
    while True:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex(('0.0.0.0', port))
        if result != 0:
            print(f"Port {port} is available, starting server...")
            break
        else:
            port += 1

    uvicorn.run(app, host="0.0.0.0", port=port)
```

```bash
python3 fake_openai_server.py
```

docs/my-website/docs/load_test_advanced.md (new file, 209 lines)

@@ -0,0 +1,209 @@
import Image from '@theme/IdealImage';


# LiteLLM Proxy - 1K RPS Load test on locust

Tutorial on how to get to 1K+ RPS with LiteLLM Proxy on locust


## Pre-Testing Checklist
- [ ] Ensure you're using the **latest `-stable` version** of litellm
    - [Github releases](https://github.com/BerriAI/litellm/releases)
    - [litellm docker containers](https://github.com/BerriAI/litellm/pkgs/container/litellm)
    - [litellm database docker container](https://github.com/BerriAI/litellm/pkgs/container/litellm-database)
- [ ] Ensure you're following **ALL** [best practices for production](./proxy/production_setup.md)
- [ ] Locust - Ensure your Locust instance can create 1K+ requests per second
    - 👉 You can use our **[maintained locust instance here](https://locust-load-tester-production.up.railway.app/)**
    - If you're self hosting locust
        - [here's the spec used for our locust machine](#machine-specifications-for-running-locust)
        - [here is the locustfile.py used for our tests](#locust-file-used-for-testing)
- [ ] Use this [**machine specification for running litellm proxy**](#machine-specifications-for-running-litellm-proxy)
- [ ] **Enterprise LiteLLM** - Use `prometheus` as a callback in your `proxy_config.yaml` to get metrics on your load test

  Set `litellm_settings.callbacks` to monitor success/failures/all types of errors
  ```yaml
  litellm_settings:
    callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
  ```


## Load Test - Fake OpenAI Endpoint

### Expected Performance

| Metric | Value |
|--------|-------|
| Requests per Second | 1174+ |
| Median Response Time | `96ms` |
| Average Response Time | `142.18ms` |

### Run Test

1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy

   litellm provides a hosted `fake-openai-endpoint` you can load test against

```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/

litellm_settings:
  callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
```

2. `pip install locust`

3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)

4. Start locust

   Run `locust` in the same directory as your `locustfile.py` from step 3

   ```shell
   locust -f locustfile.py --processes 4
   ```

5. Run Load test on locust

   Head to the locust UI on http://0.0.0.0:8089

   Set **Users=1000, Ramp Up Users=1000**, Host=Base URL of your LiteLLM Proxy

6. Expected results

<Image img={require('../img/locust_load_test1.png')} />
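
If locust shows only failures (or before starting a run), a quick way to rule out config problems is to call the proxy from step 1 directly with the OpenAI SDK. This check is not part of the original tutorial; a minimal sketch, assuming the proxy runs on `http://0.0.0.0:4000` with master key `sk-1234` as in the examples above:

```python
# Optional sanity check: confirm the proxy answers on the fake-openai-endpoint
# deployment before (or while) running locust.
# Assumes the proxy from step 1 is at http://0.0.0.0:4000 with master key sk-1234.
import openai

client = openai.OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)  # mock completion from the fake endpoint
```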

## Load test - Endpoints with Rate Limits

Run a load test on 2 LLM deployments, each with a 10K RPM quota. Expect to see ~20K RPM (333 RPS)

### Expected Performance

- We expect to see 20,000+ successful responses in 1 minute
- The remaining requests **fail because the endpoint exceeds its 10K RPM quota limit - from the LLM API provider**

| Metric | Value |
|--------|-------|
| Successful Responses in 1 minute | 20,000+ |
| Requests per Second | ~1170+ |
| Median Response Time | `70ms` |
| Average Response Time | `640.18ms` |

### Run Test

1. Add 2 `gemini-vision` deployments on your config.yaml. Each deployment can handle 10K RPM. (We setup a fake endpoint with a rate limit of 1000 RPM on the `/v1/projects/bad-adroit-crow` route below)

:::info

All requests with `model="gemini-vision"` will be load balanced equally across the 2 deployments.

:::

```yaml
model_list:
  - model_name: gemini-vision
    litellm_params:
      model: vertex_ai/gemini-1.0-pro-vision-001
      api_base: https://exampleopenaiendpoint-production.up.railway.app/v1/projects/bad-adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
      vertex_project: "adroit-crow-413218"
      vertex_location: "us-central1"
      vertex_credentials: /etc/secrets/adroit_crow.json
  - model_name: gemini-vision
    litellm_params:
      model: vertex_ai/gemini-1.0-pro-vision-001
      api_base: https://exampleopenaiendpoint-production-c715.up.railway.app/v1/projects/bad-adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
      vertex_project: "adroit-crow-413218"
      vertex_location: "us-central1"
      vertex_credentials: /etc/secrets/adroit_crow.json

litellm_settings:
  callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
```

2. `pip install locust`

3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)

4. Start locust

   Run `locust` in the same directory as your `locustfile.py` from step 3

   ```shell
   locust -f locustfile.py --processes 4 -t 60
   ```

5. Run Load test on locust

   Head to the locust UI on http://0.0.0.0:8089 and use the following settings

<Image img={require('../img/locust_load_test2_setup.png')} />

6. Expected results
   - Successful responses in 1 minute = 19,800 = (69415 - 49615)
   - Requests per second = 1170
   - Median response time = 70ms
   - Average response time = 640ms

<Image img={require('../img/locust_load_test2.png')} />


## Prometheus Metrics for debugging load tests

Use the following [prometheus metrics to debug your load tests / failures](./proxy/prometheus)

| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for a specific LLM deployment. Labels: `"requested_model", "litellm_model_name", "model_id", "api_base", "api_provider", "hashed_api_key", "api_key_alias", "team", "team_alias", "exception_status", "exception_class"` |
| `litellm_deployment_cooled_down` | Number of times a deployment has been cooled down by LiteLLM load balancing logic. Labels: `"litellm_model_name", "model_id", "api_base", "api_provider", "exception_status"` |
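
With the `prometheus` callback enabled, these metrics can also be pulled over Prometheus' HTTP query API while the load test runs. A sketch, assuming a Prometheus server that scrapes the proxy is reachable at `http://localhost:9090` (the URL and the 1-minute window are assumptions to adjust for your setup; the metric and label names come from the table above):

```python
# Sketch: pull per-deployment failure counts from Prometheus during a load test.
# Assumes a Prometheus server at http://localhost:9090 is already scraping LiteLLM.
import requests

PROM_URL = "http://localhost:9090/api/v1/query"
# Increase in failed LLM API calls over the last minute, grouped by deployment + status
query = 'sum by (litellm_model_name, exception_status) (increase(litellm_deployment_failure_responses[1m]))'

resp = requests.get(PROM_URL, params={"query": query})
for series in resp.json()["data"]["result"]:
    print(series["metric"], series["value"][1])  # label set and current value
```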


## Machine Specifications for Running Locust

| Metric | Value |
|--------|-------|
| `locust --processes 4` | 4 |
| `vCPUs` on Load Testing Machine | 2.0 vCPUs |
| `Memory` on Load Testing Machine | 450 MB |
| `Replicas` of Load Testing Machine | 1 |

## Machine Specifications for Running LiteLLM Proxy

👉 **Number of Replicas of LiteLLM Proxy=20** for getting 1K+ RPS

| Service | Spec | CPUs | Memory | Architecture | Version |
| --- | --- | --- | --- | --- | --- |
| Server | `t2.large` | `2vCPUs` | `8GB` | `x86` | |

## Locust file used for testing

```python
import os
import uuid
from locust import HttpUser, task, between

class MyUser(HttpUser):
    wait_time = between(0.5, 1)  # Random wait time between requests

    @task(100)
    def litellm_completion(self):
        # no cache hits with this
        payload = {
            "model": "fake-openai-endpoint",
            "messages": [{"role": "user", "content": f"{uuid.uuid4()} This is a test there will be no cache hits and we'll fill up the context" * 150 }],
            "user": "my-new-end-user-1"
        }
        response = self.client.post("chat/completions", json=payload)
        if response.status_code != 200:
            # log the errors in error.txt
            with open("error.txt", "a") as error_log:
                error_log.write(response.text + "\n")

    def on_start(self):
        self.api_key = os.getenv('API_KEY', 'sk-1234')
        self.client.headers.update({'Authorization': f'Bearer {self.api_key}'})
```

docs/my-website/docs/load_test_rpm.md (new file, 348 lines)

@@ -0,0 +1,348 @@

# Multi-Instance TPM/RPM (litellm.Router)

Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### Code

Let's hit the router with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`

```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio

# Model list for OpenAI and Anthropic models
model_list = [
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8080",
            "rpm": 100
        },
    },
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8081",
            "rpm": 100
        },
    },
]

router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))


async def router_completion_non_streaming():
    try:
        client: Router = random.sample([router_1, router_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.acompletion(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [router_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```

## Multi-Instance TPM/RPM Load Test (Proxy)

Test if your defined tpm/rpm limits are respected across multiple instances.

The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute

So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### 1. Setup config

```yaml
model_list:
- litellm_params:
    api_base: http://0.0.0.0:8080
    api_key: my-fake-key
    model: openai/my-fake-model
    rpm: 100
  model_name: fake-openai-endpoint
- litellm_params:
    api_base: http://0.0.0.0:8081
    api_key: my-fake-key
    model: openai/my-fake-model-2
    rpm: 100
  model_name: fake-openai-endpoint
router_settings:
  num_retries: 0
  enable_pre_call_checks: true
  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
  routing_strategy: usage-based-routing-v2
```

### 2. Start 2 proxy instances

**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000

## RUNNING on http://0.0.0.0:4000
```

**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001

## RUNNING on http://0.0.0.0:4001
```

### 3. Run Test

Let's hit the proxy with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`

```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)

#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4001"
)

async def proxy_completion_non_streaming():
    try:
        client = random.sample([litellm_client, litellm_client_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.chat.completions.create(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [proxy_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```


### Extra - Setup Fake OpenAI Server

Let's setup a fake openai server with an RPM limit of 100.

Let's call our file `fake_openai_server.py`.

```python
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse


class ProxyException(Exception):
    # NOTE: DO NOT MODIFY THIS
    # This is used to map exactly to OPENAI Exceptions
    def __init__(
        self,
        message: str,
        type: str,
        param: Optional[str],
        code: Optional[int],
    ):
        self.message = message
        self.type = type
        self.param = param
        self.code = code

    def to_dict(self) -> dict:
        """Converts the ProxyException instance to a dictionary."""
        return {
            "message": self.message,
            "type": self.type,
            "param": self.param,
            "code": self.code,
        }


limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter

@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    return JSONResponse(status_code=429,
                        content={"detail": "Rate Limited!"})

app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
    # raise HTTPException(status_code=429, detail="Rate Limited!")
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": None,
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "\n\nHello there, how may I assist you today?",
            },
            "logprobs": None,
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": 9,
            "completion_tokens": 12,
            "total_tokens": 21
        }
    }

if __name__ == "__main__":
    import socket
    import uvicorn
    port = 8080
    while True:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex(('0.0.0.0', port))
        if result != 0:
            print(f"Port {port} is available, starting server...")
            break
        else:
            port += 1

    uvicorn.run(app, host="0.0.0.0", port=port)
```

```bash
python3 fake_openai_server.py
```
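
Once the fake server is running, you can smoke-test both the mock response and the 100/minute limit by hitting it directly. A sketch, assuming the server bound to port 8080 (it falls back to the next free port if 8080 is taken):

```python
# Sketch: smoke-test the fake OpenAI server and its 100/minute rate limit.
# Assumes fake_openai_server.py is listening on port 8080.
import requests

url = "http://0.0.0.0:8080/chat/completions"
payload = {"model": "anything", "messages": [{"role": "user", "content": "hi"}]}

codes = [requests.post(url, json=payload).status_code for _ in range(105)]
print("200 responses:", codes.count(200))  # expect roughly 100
print("429 responses:", codes.count(429))  # requests over the limit get rate limited
```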

docs/my-website/docs/load_test_sdk.md (new file, 87 lines)

@@ -0,0 +1,87 @@

# LiteLLM SDK vs OpenAI

Here is a script to load test LiteLLM vs OpenAI

```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)

#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
    api_key="sk-1234",              # [CHANGE THIS]
    base_url="http://0.0.0.0:4000"
)

#### AZURE OPENAI CLIENT ####
client = AsyncAzureOpenAI(
    api_key="my-api-key",           # [CHANGE THIS]
    azure_endpoint="my-api-base",   # [CHANGE THIS]
    api_version="2023-07-01-preview"
)


#### LITELLM ROUTER ####
model_list = [
    {
        "model_name": "azure-canada",
        "litellm_params": {
            "model": "azure/my-azure-deployment-name",  # [CHANGE THIS]
            "api_key": "my-api-key",                    # [CHANGE THIS]
            "api_base": "my-api-base",                  # [CHANGE THIS]
            "api_version": "2023-07-01-preview"
        }
    }
]

router = litellm.Router(model_list=model_list)

async def openai_completion():
    try:
        response = await client.chat.completions.create(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        return response
    except Exception as e:
        print(e)
        return None


async def router_completion():
    try:
        response = await router.acompletion(
            model="azure-canada",  # [CHANGE THIS]
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True
        )
        return response
    except Exception as e:
        print(e)
        return None

async def proxy_completion_non_streaming():
    try:
        response = await litellm_client.chat.completions.create(
            model="sagemaker-models",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        print(e)
        return None

async def loadtest_fn():
    start = time.time()
    n = 500  # Number of concurrent tasks
    tasks = [proxy_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))

# Run the event loop to execute the async function
asyncio.run(loadtest_fn())
```
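
The script above wires `loadtest_fn` only to the proxy client; to actually compare the SDK paths, one option is to parameterize it over the three coroutines it defines. A sketch under that assumption (the `target` argument is an addition for illustration, not part of the original script; note that the streaming calls return a stream handle rather than a finished completion):

```python
# Sketch: run the same load against each client defined above and compare timings.
# Assumes this lives in the same file, so time/asyncio and the coroutines exist.
async def loadtest_fn(target, n: int = 500):
    start = time.time()
    tasks = [target() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful = [c for c in chat_completions if c is not None]
    print(target.__name__, n, time.time() - start, len(successful))

async def compare():
    for target in [openai_completion, router_completion, proxy_completion_non_streaming]:
        await loadtest_fn(target)

asyncio.run(compare())
```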

@@ -18,6 +18,7 @@ general_settings:
   master_key: sk-1234 # enter your own master key, ensure it starts with 'sk-'
   alerting: ["slack"] # Setup slack alerting - get alerts on LLM exceptions, Budget Alerts, Slow LLM Responses
   proxy_batch_write_at: 60 # Batch write spend updates every 60s
+  database_connection_pool_limit: 10 # limit the number of database connections to = MAX Number of DB Connections/Number of instances of litellm proxy (Around 10-20 is good number)

 litellm_settings:
   set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on

BIN  docs/my-website/img/locust_load_test1.png (new binary file, 211 KiB, not shown)
BIN  docs/my-website/img/locust_load_test2.png (new binary file, 212 KiB, not shown)
BIN  docs/my-website/img/locust_load_test2_setup.png (new binary file, 269 KiB, not shown)

@@ -251,7 +251,16 @@ const sidebars = {
       },
     ],
   },
-  "load_test",
+  {
+    type: "category",
+    label: "Load Testing",
+    items: [
+      "load_test",
+      "load_test_advanced",
+      "load_test_sdk",
+      "load_test_rpm",
+    ]
+  },
   {
     type: "category",
     label: "Logging & Observability",