diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md
index ce528746d..4641a7036 100644
--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@@ -1,8 +1,8 @@
import Image from '@theme/IdealImage';
-# Load Test LiteLLM
+# LiteLLM Proxy - Locust Load Test
-## How to run a locust load test on LiteLLM Proxy
+## Locust Load Test on LiteLLM Proxy
1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy
litellm provides a free hosted `fake-openai-endpoint` you can load test against
@@ -50,512 +50,3 @@ model_list:
-## Load Test LiteLLM Proxy - 1500+ req/s
-
-## 1500+ concurrent requests/s
-
-LiteLLM proxy has been load tested to handle 1500+ concurrent req/s
-
-```python
-import time, asyncio
-from openai import AsyncOpenAI, AsyncAzureOpenAI
-import uuid
-import traceback
-
-# base_url - litellm proxy endpoint
-# api_key - litellm proxy api-key, is created proxy with auth
-litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
-
-
-async def litellm_completion():
- # Your existing code for litellm_completion goes here
- try:
- response = await litellm_client.chat.completions.create(
- model="azure-gpt-3.5",
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- )
- print(response)
- return response
-
- except Exception as e:
- # If there's an exception, log the error message
- with open("error_log.txt", "a") as error_log:
- error_log.write(f"Error during completion: {str(e)}\n")
- pass
-
-
-async def main():
- for i in range(1):
- start = time.time()
- n = 1500 # Number of concurrent tasks
- tasks = [litellm_completion() for _ in range(n)]
-
- chat_completions = await asyncio.gather(*tasks)
-
- successful_completions = [c for c in chat_completions if c is not None]
-
- # Write errors to error_log.txt
- with open("error_log.txt", "a") as error_log:
- for completion in chat_completions:
- if isinstance(completion, str):
- error_log.write(completion + "\n")
-
- print(n, time.time() - start, len(successful_completions))
- time.sleep(10)
-
-
-if __name__ == "__main__":
- # Blank out contents of error_log.txt
- open("error_log.txt", "w").close()
-
- asyncio.run(main())
-
-```
-
-### Throughput - 30% Increase
-LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
-
-
-### Latency Added - 0.00325 seconds
-LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
-
-
-
-### Testing LiteLLM Proxy with Locust
-- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures
-
-
-
-## Load Test LiteLLM SDK vs OpenAI
-Here is a script to load test LiteLLM vs OpenAI
-
-```python
-from openai import AsyncOpenAI, AsyncAzureOpenAI
-import random, uuid
-import time, asyncio, litellm
-# import logging
-# logging.basicConfig(level=logging.DEBUG)
-#### LITELLM PROXY ####
-litellm_client = AsyncOpenAI(
- api_key="sk-1234", # [CHANGE THIS]
- base_url="http://0.0.0.0:4000"
-)
-
-#### AZURE OPENAI CLIENT ####
-client = AsyncAzureOpenAI(
- api_key="my-api-key", # [CHANGE THIS]
- azure_endpoint="my-api-base", # [CHANGE THIS]
- api_version="2023-07-01-preview"
-)
-
-
-#### LITELLM ROUTER ####
-model_list = [
- {
- "model_name": "azure-canada",
- "litellm_params": {
- "model": "azure/my-azure-deployment-name", # [CHANGE THIS]
- "api_key": "my-api-key", # [CHANGE THIS]
- "api_base": "my-api-base", # [CHANGE THIS]
- "api_version": "2023-07-01-preview"
- }
- }
-]
-
-router = litellm.Router(model_list=model_list)
-
-async def openai_completion():
- try:
- response = await client.chat.completions.create(
- model="gpt-35-turbo",
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- stream=True
- )
- return response
- except Exception as e:
- print(e)
- return None
-
-
-async def router_completion():
- try:
- response = await router.acompletion(
- model="azure-canada", # [CHANGE THIS]
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- stream=True
- )
- return response
- except Exception as e:
- print(e)
- return None
-
-async def proxy_completion_non_streaming():
- try:
- response = await litellm_client.chat.completions.create(
- model="sagemaker-models", # [CHANGE THIS] (if you call it something else on your proxy)
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- )
- return response
- except Exception as e:
- print(e)
- return None
-
-async def loadtest_fn():
- start = time.time()
- n = 500 # Number of concurrent tasks
- tasks = [proxy_completion_non_streaming() for _ in range(n)]
- chat_completions = await asyncio.gather(*tasks)
- successful_completions = [c for c in chat_completions if c is not None]
- print(n, time.time() - start, len(successful_completions))
-
-# Run the event loop to execute the async function
-asyncio.run(loadtest_fn())
-
-```
-
-## Multi-Instance TPM/RPM Load Test (Router)
-
-Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
-
-In our test:
-- Max RPM per deployment is = 100 requests per minute
-- Max Throughput / min on router = 200 requests per minute (2 deployments)
-- Load we'll send through router = 600 requests per minute
-
-:::info
-
-If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
-
-:::
-
-### Code
-
-Let's hit the router with 600 requests per minute.
-
-Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
-
-
-```python
-from litellm import Router
-import litellm
-litellm.suppress_debug_info = True
-litellm.set_verbose = False
-import logging
-logging.basicConfig(level=logging.CRITICAL)
-import os, random, uuid, time, asyncio
-
-# Model list for OpenAI and Anthropic models
-model_list = [
- {
- "model_name": "fake-openai-endpoint",
- "litellm_params": {
- "model": "gpt-3.5-turbo",
- "api_key": "my-fake-key",
- "api_base": "http://0.0.0.0:8080",
- "rpm": 100
- },
- },
- {
- "model_name": "fake-openai-endpoint",
- "litellm_params": {
- "model": "gpt-3.5-turbo",
- "api_key": "my-fake-key",
- "api_base": "http://0.0.0.0:8081",
- "rpm": 100
- },
- },
-]
-
-router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
-router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
-
-
-
-async def router_completion_non_streaming():
- try:
- client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
- # print(f"client={client}")
- response = await client.acompletion(
- model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- )
- return response
- except Exception as e:
- # print(e)
- return None
-
-async def loadtest_fn():
- start = time.time()
- n = 600 # Number of concurrent tasks
- tasks = [router_completion_non_streaming() for _ in range(n)]
- chat_completions = await asyncio.gather(*tasks)
- successful_completions = [c for c in chat_completions if c is not None]
- print(n, time.time() - start, len(successful_completions))
-
-def get_utc_datetime():
- import datetime as dt
- from datetime import datetime
-
- if hasattr(dt, "UTC"):
- return datetime.now(dt.UTC) # type: ignore
- else:
- return datetime.utcnow() # type: ignore
-
-
-# Run the event loop to execute the async function
-async def parent_fn():
- for _ in range(10):
- dt = get_utc_datetime()
- current_minute = dt.strftime("%H-%M")
- print(f"triggered new batch - {current_minute}")
- await loadtest_fn()
- await asyncio.sleep(10)
-
-asyncio.run(parent_fn())
-```
-## Multi-Instance TPM/RPM Load Test (Proxy)
-
-Test if your defined tpm/rpm limits are respected across multiple instances.
-
-The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
-
-In our test:
-- Max RPM per deployment is = 100 requests per minute
-- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
-- Load we'll send to proxy = 600 requests per minute
-
-
-So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
-
-:::info
-
-If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
-
-:::
-
-### 1. Setup config
-
-```yaml
-model_list:
-- litellm_params:
- api_base: http://0.0.0.0:8080
- api_key: my-fake-key
- model: openai/my-fake-model
- rpm: 100
- model_name: fake-openai-endpoint
-- litellm_params:
- api_base: http://0.0.0.0:8081
- api_key: my-fake-key
- model: openai/my-fake-model-2
- rpm: 100
- model_name: fake-openai-endpoint
-router_settings:
- num_retries: 0
- enable_pre_call_checks: true
- redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
- redis_password: os.environ/REDIS_PASSWORD
- redis_port: os.environ/REDIS_PORT
- routing_strategy: usage-based-routing-v2
-```
-
-### 2. Start proxy 2 instances
-
-**Instance 1**
-```bash
-litellm --config /path/to/config.yaml --port 4000
-
-## RUNNING on http://0.0.0.0:4000
-```
-
-**Instance 2**
-```bash
-litellm --config /path/to/config.yaml --port 4001
-
-## RUNNING on http://0.0.0.0:4001
-```
-
-### 3. Run Test
-
-Let's hit the proxy with 600 requests per minute.
-
-Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
-
-```python
-from openai import AsyncOpenAI, AsyncAzureOpenAI
-import random, uuid
-import time, asyncio, litellm
-# import logging
-# logging.basicConfig(level=logging.DEBUG)
-#### LITELLM PROXY ####
-litellm_client = AsyncOpenAI(
- api_key="sk-1234", # [CHANGE THIS]
- base_url="http://0.0.0.0:4000"
-)
-litellm_client_2 = AsyncOpenAI(
- api_key="sk-1234", # [CHANGE THIS]
- base_url="http://0.0.0.0:4001"
-)
-
-async def proxy_completion_non_streaming():
- try:
- client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
- # print(f"client={client}")
- response = await client.chat.completions.create(
- model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
- messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
- )
- return response
- except Exception as e:
- # print(e)
- return None
-
-async def loadtest_fn():
- start = time.time()
- n = 600 # Number of concurrent tasks
- tasks = [proxy_completion_non_streaming() for _ in range(n)]
- chat_completions = await asyncio.gather(*tasks)
- successful_completions = [c for c in chat_completions if c is not None]
- print(n, time.time() - start, len(successful_completions))
-
-def get_utc_datetime():
- import datetime as dt
- from datetime import datetime
-
- if hasattr(dt, "UTC"):
- return datetime.now(dt.UTC) # type: ignore
- else:
- return datetime.utcnow() # type: ignore
-
-
-# Run the event loop to execute the async function
-async def parent_fn():
- for _ in range(10):
- dt = get_utc_datetime()
- current_minute = dt.strftime("%H-%M")
- print(f"triggered new batch - {current_minute}")
- await loadtest_fn()
- await asyncio.sleep(10)
-
-asyncio.run(parent_fn())
-
-```
-
-
-### Extra - Setup Fake OpenAI Server
-
-Let's setup a fake openai server with a RPM limit of 100.
-
-Let's call our file `fake_openai_server.py`.
-
-```
-# import sys, os
-# sys.path.insert(
-# 0, os.path.abspath("../")
-# ) # Adds the parent directory to the system path
-from fastapi import FastAPI, Request, status, HTTPException, Depends
-from fastapi.responses import StreamingResponse
-from fastapi.security import OAuth2PasswordBearer
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-from fastapi import FastAPI, Request, HTTPException, UploadFile, File
-import httpx, os, json
-from openai import AsyncOpenAI
-from typing import Optional
-from slowapi import Limiter
-from slowapi.util import get_remote_address
-from slowapi.errors import RateLimitExceeded
-from fastapi import FastAPI, Request, HTTPException
-from fastapi.responses import PlainTextResponse
-
-
-class ProxyException(Exception):
- # NOTE: DO NOT MODIFY THIS
- # This is used to map exactly to OPENAI Exceptions
- def __init__(
- self,
- message: str,
- type: str,
- param: Optional[str],
- code: Optional[int],
- ):
- self.message = message
- self.type = type
- self.param = param
- self.code = code
-
- def to_dict(self) -> dict:
- """Converts the ProxyException instance to a dictionary."""
- return {
- "message": self.message,
- "type": self.type,
- "param": self.param,
- "code": self.code,
- }
-
-
-limiter = Limiter(key_func=get_remote_address)
-app = FastAPI()
-app.state.limiter = limiter
-
-@app.exception_handler(RateLimitExceeded)
-async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
- return JSONResponse(status_code=429,
- content={"detail": "Rate Limited!"})
-
-app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
-
-app.add_middleware(
- CORSMiddleware,
- allow_origins=["*"],
- allow_credentials=True,
- allow_methods=["*"],
- allow_headers=["*"],
-)
-
-# for completion
-@app.post("/chat/completions")
-@app.post("/v1/chat/completions")
-@limiter.limit("100/minute")
-async def completion(request: Request):
- # raise HTTPException(status_code=429, detail="Rate Limited!")
- return {
- "id": "chatcmpl-123",
- "object": "chat.completion",
- "created": 1677652288,
- "model": None,
- "system_fingerprint": "fp_44709d6fcb",
- "choices": [{
- "index": 0,
- "message": {
- "role": "assistant",
- "content": "\n\nHello there, how may I assist you today?",
- },
- "logprobs": None,
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": 9,
- "completion_tokens": 12,
- "total_tokens": 21
- }
- }
-
-if __name__ == "__main__":
- import socket
- import uvicorn
- port = 8080
- while True:
- sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- result = sock.connect_ex(('0.0.0.0', port))
- if result != 0:
- print(f"Port {port} is available, starting server...")
- break
- else:
- port += 1
-
- uvicorn.run(app, host="0.0.0.0", port=port)
-```
-
-```bash
-python3 fake_openai_server.py
-```
diff --git a/docs/my-website/docs/load_test_advanced.md b/docs/my-website/docs/load_test_advanced.md
new file mode 100644
index 000000000..a73325014
--- /dev/null
+++ b/docs/my-website/docs/load_test_advanced.md
@@ -0,0 +1,209 @@
+import Image from '@theme/IdealImage';
+
+
+# LiteLLM Proxy - 1K RPS Load test on locust
+
+Tutorial on how to reach 1K+ RPS with LiteLLM Proxy using Locust.
+
+
+## Pre-Testing Checklist
+- [ ] Ensure you're using the **latest `-stable` version** of litellm
+ - [Github releases](https://github.com/BerriAI/litellm/releases)
+ - [litellm docker containers](https://github.com/BerriAI/litellm/pkgs/container/litellm)
+ - [litellm database docker container](https://github.com/BerriAI/litellm/pkgs/container/litellm-database)
+- [ ] Ensure you're following **ALL** [best practices for production](./proxy/production_setup.md)
+- [ ] Locust - Ensure your Locust instance can create 1K+ requests per second
+ - 👉 You can use our **[maintained locust instance here](https://locust-load-tester-production.up.railway.app/)**
+ - If you're self hosting locust
+ - [here's the spec used for our locust machine](#machine-specifications-for-running-locust)
+ - [here is the locustfile.py used for our tests](#locust-file-used-for-testing)
+- [ ] Use this [**machine specification for running litellm proxy**](#machine-specifications-for-running-litellm-proxy)
+- [ ] **Enterprise LiteLLM** - Use `prometheus` as a callback in your `proxy_config.yaml` to get metrics on your load test
+ Set `litellm_settings.callbacks` to monitor success/failures/all types of errors
+ ```yaml
+ litellm_settings:
+ callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
+ ```
+
+
+
+## Load Test - Fake OpenAI Endpoint
+
+### Expected Performance
+
+| Metric | Value |
+|--------|-------|
+| Requests per Second | 1174+ |
+| Median Response Time | `96ms` |
+| Average Response Time | `142.18ms` |
+
+### Run Test
+
+1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy
+litellm provides a hosted `fake-openai-endpoint` you can load test against
+
+```yaml
+model_list:
+ - model_name: fake-openai-endpoint
+ litellm_params:
+ model: openai/fake
+ api_key: fake-key
+ api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+litellm_settings:
+ callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
+```
+
+2. `pip install locust`
+
+3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)
+
+4. Start locust
+  Run `locust` in the same directory as your `locustfile.py` from step 3
+
+ ```shell
+ locust -f locustfile.py --processes 4
+ ```
+
+5. Run Load test on locust
+
+ Head to the locust UI on http://0.0.0.0:8089
+
+ Set **Users=1000, Ramp Up Users=1000**, Host=Base URL of your LiteLLM Proxy
+
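+  Before starting the run, it can help to sanity check that the proxy and the `fake-openai-endpoint` route respond. A minimal sketch, assuming the proxy is running on `http://0.0.0.0:4000` and `sk-1234` is a valid key for it:
+
+  ```python
+  from openai import OpenAI
+
+  # hypothetical quick check - not part of the load test itself
+  client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+  response = client.chat.completions.create(
+      model="fake-openai-endpoint",
+      messages=[{"role": "user", "content": "ping"}],
+  )
+  print(response.choices[0].message.content)
+  ```
+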
+6. Expected results
+
+
+
+## Load test - Endpoints with Rate Limits
+
+Run a load test on 2 LLM deployments, each with a 10K RPM quota. Expect to see ~20K RPM (333 RPS).
+
+### Expected Performance
+
+- We expect to see 20,000+ successful responses in 1 minute
+- The remaining requests **fail because the endpoint exceeds its 10K RPM quota limit from the LLM API provider**
+
+| Metric | Value |
+|--------|-------|
+| Successful Responses in 1 minute | 20,000+ |
+| Requests per Second | ~1170+ |
+| Median Response Time | `70ms` |
+| Average Response Time | `640.18ms` |
+
+### Run Test
+
+1. Add 2 `gemini-vision` deployments to your config.yaml. Each deployment can handle 10K RPM. (We set up a fake endpoint with a rate limit of 1000 RPM on the `/v1/projects/bad-adroit-crow` route below.)
+
+:::info
+
+All requests with `model="gemini-vision"` will be load balanced equally across the 2 deployments.
+
+:::
+
+```yaml
+model_list:
+ - model_name: gemini-vision
+ litellm_params:
+ model: vertex_ai/gemini-1.0-pro-vision-001
+ api_base: https://exampleopenaiendpoint-production.up.railway.app/v1/projects/bad-adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
+ vertex_project: "adroit-crow-413218"
+ vertex_location: "us-central1"
+ vertex_credentials: /etc/secrets/adroit_crow.json
+ - model_name: gemini-vision
+ litellm_params:
+ model: vertex_ai/gemini-1.0-pro-vision-001
+ api_base: https://exampleopenaiendpoint-production-c715.up.railway.app/v1/projects/bad-adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
+ vertex_project: "adroit-crow-413218"
+ vertex_location: "us-central1"
+ vertex_credentials: /etc/secrets/adroit_crow.json
+
+litellm_settings:
+ callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
+```
+
+2. `pip install locust`
+
+3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)
+
+4. Start locust
+  Run `locust` in the same directory as your `locustfile.py` from step 3
+
+ ```shell
+ locust -f locustfile.py --processes 4 -t 60
+ ```
+
+5. Run Load test on locust
+
+ Head to the locust UI on http://0.0.0.0:8089 and use the following settings
+
+
+
+6. Expected results
+  - Successful responses in 1 minute = 19,800 (69,415 - 49,615)
+ - Requests per second = 1170
+ - Median response time = 70ms
+ - Average response time = 640ms
+
+
+
+
+## Prometheus Metrics for debugging load tests
+
+Use the following [prometheus metrics to debug your load tests / failures](./proxy/prometheus)
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for a specific LLM deployment. Labels: `"requested_model", "litellm_model_name", "model_id", "api_base", "api_provider", "hashed_api_key", "api_key_alias", "team", "team_alias", "exception_status", "exception_class"` |
+| `litellm_deployment_cooled_down` | Number of times a deployment has been cooled down by LiteLLM load balancing logic. Labels: `"litellm_model_name", "model_id", "api_base", "api_provider", "exception_status"` |
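+
+For example, to pull these counters while a test is running, you can query the Prometheus HTTP API directly. A minimal sketch, assuming your Prometheus server scrapes the proxy's `/metrics` endpoint and is reachable at `http://localhost:9090` (adjust the URL and lookback window for your setup):
+
+```python
+import requests
+
+PROM_URL = "http://localhost:9090"  # assumption: your Prometheus server
+
+# failed LLM API calls per deployment over the last 5 minutes
+query = 'sum by (api_base, exception_status) (increase(litellm_deployment_failure_responses[5m]))'
+
+resp = requests.get(f"{PROM_URL}/api/v1/query", params={"query": query})
+resp.raise_for_status()
+
+for series in resp.json()["data"]["result"]:
+    print(series["metric"], series["value"][1])
+```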
+
+
+
+## Machine Specifications for Running Locust
+
+| Metric | Value |
+|--------|-------|
+| `locust --processes 4` | 4 |
+| `vCPUs` on Load Testing Machine | 2.0 vCPUs |
+| `Memory` on Load Testing Machine | 450 MB |
+| `Replicas` of Load Testing Machine | 1 |
+
+## Machine Specifications for Running LiteLLM Proxy
+
+👉 **Number of Replicas of LiteLLM Proxy=20** for getting 1K+ RPS
+
+| Service | Spec | CPUs | Memory | Architecture | Version|
+| --- | --- | --- | --- | --- | --- |
+| Server | `t2.large` | `2vCPUs` | `8GB` | `x86` | |
+
+
+## Locust file used for testing
+
+```python
+import os
+import uuid
+from locust import HttpUser, task, between
+
+class MyUser(HttpUser):
+ wait_time = between(0.5, 1) # Random wait time between requests
+
+ @task(100)
+ def litellm_completion(self):
+ # no cache hits with this
+ payload = {
+ "model": "fake-openai-endpoint",
+ "messages": [{"role": "user", "content": f"{uuid.uuid4()} This is a test there will be no cache hits and we'll fill up the context" * 150 }],
+ "user": "my-new-end-user-1"
+ }
+ response = self.client.post("chat/completions", json=payload)
+ if response.status_code != 200:
+ # log the errors in error.txt
+ with open("error.txt", "a") as error_log:
+ error_log.write(response.text + "\n")
+
+
+
+ def on_start(self):
+ self.api_key = os.getenv('API_KEY', 'sk-1234')
+ self.client.headers.update({'Authorization': f'Bearer {self.api_key}'})
+```
\ No newline at end of file
diff --git a/docs/my-website/docs/load_test_rpm.md b/docs/my-website/docs/load_test_rpm.md
new file mode 100644
index 000000000..0954ffcdf
--- /dev/null
+++ b/docs/my-website/docs/load_test_rpm.md
@@ -0,0 +1,348 @@
+
+
+# Multi-Instance TPM/RPM (litellm.Router)
+
+Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
+
+In our test:
+- Max RPM per deployment = 100 requests per minute
+- Max Throughput / min on router = 200 requests per minute (2 deployments)
+- Load we'll send through router = 600 requests per minute
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### Code
+
+Let's hit the router with 600 requests per minute.
+
+Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
+
+
+```python
+from litellm import Router
+import litellm
+litellm.suppress_debug_info = True
+litellm.set_verbose = False
+import logging
+logging.basicConfig(level=logging.CRITICAL)
+import os, random, uuid, time, asyncio
+
+# Model list for OpenAI and Anthropic models
+model_list = [
+ {
+ "model_name": "fake-openai-endpoint",
+ "litellm_params": {
+ "model": "gpt-3.5-turbo",
+ "api_key": "my-fake-key",
+ "api_base": "http://0.0.0.0:8080",
+ "rpm": 100
+ },
+ },
+ {
+ "model_name": "fake-openai-endpoint",
+ "litellm_params": {
+ "model": "gpt-3.5-turbo",
+ "api_key": "my-fake-key",
+ "api_base": "http://0.0.0.0:8081",
+ "rpm": 100
+ },
+ },
+]
+
+router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+
+
+
+async def router_completion_non_streaming():
+ try:
+ client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
+ # print(f"client={client}")
+ response = await client.acompletion(
+ model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ )
+ return response
+ except Exception as e:
+ # print(e)
+ return None
+
+async def loadtest_fn():
+ start = time.time()
+ n = 600 # Number of concurrent tasks
+ tasks = [router_completion_non_streaming() for _ in range(n)]
+ chat_completions = await asyncio.gather(*tasks)
+ successful_completions = [c for c in chat_completions if c is not None]
+ print(n, time.time() - start, len(successful_completions))
+
+def get_utc_datetime():
+ import datetime as dt
+ from datetime import datetime
+
+ if hasattr(dt, "UTC"):
+ return datetime.now(dt.UTC) # type: ignore
+ else:
+ return datetime.utcnow() # type: ignore
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+ for _ in range(10):
+ dt = get_utc_datetime()
+ current_minute = dt.strftime("%H-%M")
+ print(f"triggered new batch - {current_minute}")
+ await loadtest_fn()
+ await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+```
+## Multi-Instance TPM/RPM Load Test (Proxy)
+
+Test if your defined tpm/rpm limits are respected across multiple instances.
+
+The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
+
+In our test:
+- Max RPM per deployment = 100 requests per minute
+- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
+- Load we'll send to proxy = 600 requests per minute
+
+
+So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### 1. Setup config
+
+```yaml
+model_list:
+- litellm_params:
+ api_base: http://0.0.0.0:8080
+ api_key: my-fake-key
+ model: openai/my-fake-model
+ rpm: 100
+ model_name: fake-openai-endpoint
+- litellm_params:
+ api_base: http://0.0.0.0:8081
+ api_key: my-fake-key
+ model: openai/my-fake-model-2
+ rpm: 100
+ model_name: fake-openai-endpoint
+router_settings:
+ num_retries: 0
+ enable_pre_call_checks: true
+ redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
+ redis_password: os.environ/REDIS_PASSWORD
+ redis_port: os.environ/REDIS_PORT
+ routing_strategy: usage-based-routing-v2
+```
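+
+The rpm limits are enforced across instances via Redis, so it's worth confirming both proxy instances can reach the same Redis before running the test. A minimal sketch, assuming the `redis` Python package is installed and the same `REDIS_*` environment variables as the config are set:
+
+```python
+import os
+import redis
+
+# hypothetical connectivity check - uses the same env vars as the proxy config
+r = redis.Redis(
+    host=os.getenv("REDIS_HOST"),
+    port=int(os.getenv("REDIS_PORT", "6379")),
+    password=os.getenv("REDIS_PASSWORD"),
+)
+print(r.ping())  # True if both instances can share rate limit state here
+```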
+
+### 2. Start 2 proxy instances
+
+**Instance 1**
+```bash
+litellm --config /path/to/config.yaml --port 4000
+
+## RUNNING on http://0.0.0.0:4000
+```
+
+**Instance 2**
+```bash
+litellm --config /path/to/config.yaml --port 4001
+
+## RUNNING on http://0.0.0.0:4001
+```
+
+### 3. Run Test
+
+Let's hit the proxy with 600 requests per minute.
+
+Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
+
+```python
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import random, uuid
+import time, asyncio, litellm
+# import logging
+# logging.basicConfig(level=logging.DEBUG)
+#### LITELLM PROXY ####
+litellm_client = AsyncOpenAI(
+ api_key="sk-1234", # [CHANGE THIS]
+ base_url="http://0.0.0.0:4000"
+)
+litellm_client_2 = AsyncOpenAI(
+ api_key="sk-1234", # [CHANGE THIS]
+ base_url="http://0.0.0.0:4001"
+)
+
+async def proxy_completion_non_streaming():
+ try:
+ client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
+ # print(f"client={client}")
+ response = await client.chat.completions.create(
+ model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ )
+ return response
+ except Exception as e:
+ # print(e)
+ return None
+
+async def loadtest_fn():
+ start = time.time()
+ n = 600 # Number of concurrent tasks
+ tasks = [proxy_completion_non_streaming() for _ in range(n)]
+ chat_completions = await asyncio.gather(*tasks)
+ successful_completions = [c for c in chat_completions if c is not None]
+ print(n, time.time() - start, len(successful_completions))
+
+def get_utc_datetime():
+ import datetime as dt
+ from datetime import datetime
+
+ if hasattr(dt, "UTC"):
+ return datetime.now(dt.UTC) # type: ignore
+ else:
+ return datetime.utcnow() # type: ignore
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+ for _ in range(10):
+ dt = get_utc_datetime()
+ current_minute = dt.strftime("%H-%M")
+ print(f"triggered new batch - {current_minute}")
+ await loadtest_fn()
+ await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+
+```
+
+
+### Extra - Setup Fake OpenAI Server
+
+Let's set up a fake OpenAI server with an RPM limit of 100.
+
+Let's call our file `fake_openai_server.py`.
+
+```python
+# import sys, os
+# sys.path.insert(
+# 0, os.path.abspath("../")
+# ) # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from fastapi import FastAPI, Request, HTTPException, UploadFile, File
+import httpx, os, json
+from openai import AsyncOpenAI
+from typing import Optional
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import PlainTextResponse
+
+
+class ProxyException(Exception):
+ # NOTE: DO NOT MODIFY THIS
+ # This is used to map exactly to OPENAI Exceptions
+ def __init__(
+ self,
+ message: str,
+ type: str,
+ param: Optional[str],
+ code: Optional[int],
+ ):
+ self.message = message
+ self.type = type
+ self.param = param
+ self.code = code
+
+ def to_dict(self) -> dict:
+ """Converts the ProxyException instance to a dictionary."""
+ return {
+ "message": self.message,
+ "type": self.type,
+ "param": self.param,
+ "code": self.code,
+ }
+
+
+limiter = Limiter(key_func=get_remote_address)
+app = FastAPI()
+app.state.limiter = limiter
+
+@app.exception_handler(RateLimitExceeded)
+async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+ return JSONResponse(status_code=429,
+ content={"detail": "Rate Limited!"})
+
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+@limiter.limit("100/minute")
+async def completion(request: Request):
+ # raise HTTPException(status_code=429, detail="Rate Limited!")
+ return {
+ "id": "chatcmpl-123",
+ "object": "chat.completion",
+ "created": 1677652288,
+ "model": None,
+ "system_fingerprint": "fp_44709d6fcb",
+ "choices": [{
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "\n\nHello there, how may I assist you today?",
+ },
+ "logprobs": None,
+ "finish_reason": "stop"
+ }],
+ "usage": {
+ "prompt_tokens": 9,
+ "completion_tokens": 12,
+ "total_tokens": 21
+ }
+ }
+
+if __name__ == "__main__":
+ import socket
+ import uvicorn
+ port = 8080
+ while True:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ result = sock.connect_ex(('0.0.0.0', port))
+ if result != 0:
+ print(f"Port {port} is available, starting server...")
+ break
+ else:
+ port += 1
+
+ uvicorn.run(app, host="0.0.0.0", port=port)
+```
+
+```bash
+python3 fake_openai_server.py
+```
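+
+Once the fake server is up, you can optionally confirm its 100 RPM limit by sending a burst of requests and counting the 429s. A minimal sketch, assuming the server came up on port 8080 and the `httpx` package is installed:
+
+```python
+import httpx
+
+# hypothetical check - requests beyond 100 within a minute should get rate limited (429)
+codes = []
+for _ in range(105):
+    r = httpx.post(
+        "http://0.0.0.0:8080/chat/completions",
+        json={"model": "my-fake-model", "messages": [{"role": "user", "content": "hi"}]},
+    )
+    codes.append(r.status_code)
+
+print("200s:", codes.count(200), "429s:", codes.count(429))
+```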
diff --git a/docs/my-website/docs/load_test_sdk.md b/docs/my-website/docs/load_test_sdk.md
new file mode 100644
index 000000000..8814786b4
--- /dev/null
+++ b/docs/my-website/docs/load_test_sdk.md
@@ -0,0 +1,87 @@
+# LiteLLM SDK vs OpenAI
+
+Here is a script to load test the LiteLLM SDK (Router) and LiteLLM Proxy against the raw Azure OpenAI client. `loadtest_fn` below calls the proxy by default; swap in `router_completion` or `openai_completion` to compare the other paths.
+
+```python
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import random, uuid
+import time, asyncio, litellm
+# import logging
+# logging.basicConfig(level=logging.DEBUG)
+#### LITELLM PROXY ####
+litellm_client = AsyncOpenAI(
+ api_key="sk-1234", # [CHANGE THIS]
+ base_url="http://0.0.0.0:4000"
+)
+
+#### AZURE OPENAI CLIENT ####
+client = AsyncAzureOpenAI(
+ api_key="my-api-key", # [CHANGE THIS]
+ azure_endpoint="my-api-base", # [CHANGE THIS]
+ api_version="2023-07-01-preview"
+)
+
+
+#### LITELLM ROUTER ####
+model_list = [
+ {
+ "model_name": "azure-canada",
+ "litellm_params": {
+ "model": "azure/my-azure-deployment-name", # [CHANGE THIS]
+ "api_key": "my-api-key", # [CHANGE THIS]
+ "api_base": "my-api-base", # [CHANGE THIS]
+ "api_version": "2023-07-01-preview"
+ }
+ }
+]
+
+router = litellm.Router(model_list=model_list)
+
+async def openai_completion():
+ try:
+ response = await client.chat.completions.create(
+ model="gpt-35-turbo",
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ stream=True
+ )
+ return response
+ except Exception as e:
+ print(e)
+ return None
+
+
+async def router_completion():
+ try:
+ response = await router.acompletion(
+ model="azure-canada", # [CHANGE THIS]
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ stream=True
+ )
+ return response
+ except Exception as e:
+ print(e)
+ return None
+
+async def proxy_completion_non_streaming():
+ try:
+ response = await litellm_client.chat.completions.create(
+ model="sagemaker-models", # [CHANGE THIS] (if you call it something else on your proxy)
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ )
+ return response
+ except Exception as e:
+ print(e)
+ return None
+
+async def loadtest_fn():
+ start = time.time()
+ n = 500 # Number of concurrent tasks
+    tasks = [proxy_completion_non_streaming() for _ in range(n)]  # swap in router_completion / openai_completion to compare the other paths
+ chat_completions = await asyncio.gather(*tasks)
+ successful_completions = [c for c in chat_completions if c is not None]
+ print(n, time.time() - start, len(successful_completions))
+
+# Run the event loop to execute the async function
+asyncio.run(loadtest_fn())
+
+```
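+
+To compare the three paths, you can run the same batch against each callable and compare wall-clock time and success counts. A minimal sketch reusing the functions defined above (the `compare` helper is hypothetical, not part of the original script):
+
+```python
+async def compare(n: int = 100):
+    for name, fn in [
+        ("proxy", proxy_completion_non_streaming),
+        ("router", router_completion),
+        ("azure", openai_completion),
+    ]:
+        start = time.time()
+        results = await asyncio.gather(*[fn() for _ in range(n)])
+        ok = len([r for r in results if r is not None])
+        # note: router_completion / openai_completion use stream=True,
+        # so this measures time to create the stream, not full generation
+        print(f"{name}: {ok}/{n} succeeded in {time.time() - start:.2f}s")
+
+# asyncio.run(compare())
+```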
diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md
index 2fb4dd3b3..c42b07d8e 100644
--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@@ -18,6 +18,7 @@ general_settings:
master_key: sk-1234 # enter your own master key, ensure it starts with 'sk-'
alerting: ["slack"] # Setup slack alerting - get alerts on LLM exceptions, Budget Alerts, Slow LLM Responses
proxy_batch_write_at: 60 # Batch write spend updates every 60s
+  database_connection_pool_limit: 10 # limit DB connections per litellm proxy instance = max DB connections / number of litellm proxy instances (around 10-20 is a good number)
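+  # e.g. (illustrative numbers) a Postgres max_connections of 200 shared across 20 litellm proxy instances -> database_connection_pool_limit of 10 per instance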
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
diff --git a/docs/my-website/img/locust_load_test1.png b/docs/my-website/img/locust_load_test1.png
new file mode 100644
index 000000000..6ea959f45
Binary files /dev/null and b/docs/my-website/img/locust_load_test1.png differ
diff --git a/docs/my-website/img/locust_load_test2.png b/docs/my-website/img/locust_load_test2.png
new file mode 100644
index 000000000..74f979cff
Binary files /dev/null and b/docs/my-website/img/locust_load_test2.png differ
diff --git a/docs/my-website/img/locust_load_test2_setup.png b/docs/my-website/img/locust_load_test2_setup.png
new file mode 100644
index 000000000..28f457e41
Binary files /dev/null and b/docs/my-website/img/locust_load_test2_setup.png differ
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index d83a4e1fb..1fe5d6364 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -251,7 +251,16 @@ const sidebars = {
},
],
},
- "load_test",
+ {
+ type: "category",
+ label: "Load Testing",
+ items: [
+ "load_test",
+ "load_test_advanced",
+ "load_test_sdk",
+ "load_test_rpm",
+ ]
+ },
{
type: "category",
label: "Logging & Observability",