diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md
index f568b5696..94165fb7b 100644
--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@@ -1,5 +1,84 @@
+import Image from '@theme/IdealImage';
+
 # 🔥 Load Test LiteLLM
 
+## Load Test LiteLLM Proxy - 1500+ req/s
+
+### 1500+ concurrent requests/s
+
+LiteLLM proxy has been load tested to handle 1500+ concurrent requests/second.
+
+```python
+import time, asyncio
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import uuid
+import traceback
+
+# base_url - litellm proxy endpoint
+# api_key - litellm proxy api key, created when the proxy is started with auth enabled
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+
+async def litellm_completion():
+    # Send a single chat completion request through the LiteLLM proxy
+    try:
+        response = await litellm_client.chat.completions.create(
+            model="azure-gpt-3.5",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+        )
+        print(response)
+        return response
+
+    except Exception as e:
+        # If there's an exception, log the error message
+        with open("error_log.txt", "a") as error_log:
+            error_log.write(f"Error during completion: {str(e)}\n")
+        pass
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 1500 # Number of concurrent tasks
+        tasks = [litellm_completion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        await asyncio.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
+
+```
+
+### Throughput - 30% Increase
+LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the Raw OpenAI API
+
+
+### Latency Added - 0.00325 seconds
+LiteLLM proxy adds **0.00325 seconds** of latency compared to the Raw OpenAI API
+
+
+
+### Testing LiteLLM Proxy with Locust
+- A single LiteLLM container can handle ~140 requests/second with 0.4 failures
+
+
+
+## Load Test LiteLLM SDK vs OpenAI
 Here is a script to load test LiteLLM vs OpenAI
 
 ```python
@@ -84,4 +163,5 @@ async def loadtest_fn():
 
 # Run the event loop to execute the async function
 asyncio.run(loadtest_fn())
-```
\ No newline at end of file
+```
+
diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md
index 6de8625d0..4b51f094c 100644
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@@ -350,17 +350,3 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
 
 Your LiteLLM container should be running now on the defined port e.g. `8000`.
 
-
-
-## LiteLLM Proxy Performance
-
-LiteLLM proxy has been load tested to handle 1500 req/s.
-
-### Throughput - 30% Increase
-LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
-
-
-### Latency Added - 0.00325 seconds
-LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
-
diff --git a/docs/my-website/img/locust.png b/docs/my-website/img/locust.png
new file mode 100644
index 000000000..1bcedf1d0
Binary files /dev/null and b/docs/my-website/img/locust.png differ
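The "Testing LiteLLM Proxy with Locust" section added in the diff above ships a Locust screenshot (`img/locust.png`) but no locustfile. Below is a minimal sketch of one, reusing the `http://0.0.0.0:4000` endpoint, `sk-1234` key, and `azure-gpt-3.5` model alias from the async script in the diff; the user class name, wait times, and the `/chat/completions` route are assumptions to adjust for your own deployment.

```python
# locustfile.py - minimal sketch for load testing the LiteLLM proxy with Locust
import uuid

from locust import HttpUser, between, task


class LiteLLMProxyUser(HttpUser):
    # Assumed values, mirroring the async script in the diff; change for your setup.
    host = "http://0.0.0.0:4000"
    wait_time = between(0.5, 1.5)  # seconds each simulated user waits between tasks

    @task
    def chat_completion(self):
        headers = {
            "Authorization": "Bearer sk-1234",  # litellm proxy api key
            "Content-Type": "application/json",
        }
        payload = {
            "model": "azure-gpt-3.5",
            "messages": [
                {"role": "user", "content": f"This is a load test: {uuid.uuid4()}"}
            ],
        }
        # OpenAI-compatible chat completions route exposed by the proxy;
        # non-2xx responses show up in Locust's failure count.
        self.client.post("/chat/completions", json=payload, headers=headers)
```

Run it with something like `locust -f locustfile.py --headless -u 200 -r 20 --run-time 1m` and compare the requests/second and failure columns in Locust's output against the ~140 req/s per container figure quoted in the doc.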