diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md
index ca7e1e8a9a..cd6eebc39c 100644
--- a/docs/my-website/docs/simple_proxy.md
+++ b/docs/my-website/docs/simple_proxy.md
@@ -1077,6 +1077,18 @@ Expected output on Langfuse
+## LiteLLM Proxy Performance
+
+### Throughput - 30% Increase
+LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API
+
+
+### Latency Added - 0.00325 seconds
+LiteLLM proxy adds **0.00325 seconds** of latency compared to the raw OpenAI API
+
+
+
+
 ## Proxy CLI Arguments
diff --git a/docs/my-website/img/latency.png b/docs/my-website/img/latency.png
new file mode 100644
index 0000000000..4ca7964f48
Binary files /dev/null and b/docs/my-website/img/latency.png differ
diff --git a/docs/my-website/img/throughput.png b/docs/my-website/img/throughput.png
new file mode 100644
index 0000000000..76dc81f605
Binary files /dev/null and b/docs/my-website/img/throughput.png differ
diff --git a/litellm/proxy/tests/load_test_embedding_proxy.py b/litellm/proxy/tests/load_test_embedding_proxy.py
new file mode 100644
index 0000000000..45136fafc9
--- /dev/null
+++ b/litellm/proxy/tests/load_test_embedding_proxy.py
@@ -0,0 +1,101 @@
+# Time how long it takes to make 500 concurrent embedding requests through the LiteLLM proxy
+
+import sys, os
+import traceback
+from dotenv import load_dotenv
+
+load_dotenv()
+import os, io
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import pytest
+
+
+import litellm
+litellm.set_verbose = False
+
+
+
+question = "embed this very long text" * 100
+
+
+# Make X concurrent embedding calls through the LiteLLM proxy, using the same long input for each request.
+# Allow X (the number of concurrent calls) to be tuned. Log the question, output/exception, and response time.
+# Show a summary of requests made, successful calls, and failed calls. For failed calls, show the exceptions.
+
+import concurrent.futures
+import random
+import time
+
+
+# Function to make a single embedding request to the LiteLLM proxy
+def make_openai_embedding(question):
+    try:
+        start_time = time.time()
+        import openai
+        client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000")  # point the client at the LiteLLM proxy
+        response = client.embeddings.create(
+            model="text-embedding-ada-002",
+            input=[question],
+        )
+        print(response)
+        end_time = time.time()
+
+        # Log the request details
+        # with open("request_log.txt", "a") as log_file:
+        #     log_file.write(
+        #         f"Question: {question[:100]}\nEmbedding dims: {len(response.data[0].embedding)}\nTime: {end_time - start_time:.2f} seconds\n\n"
+        #     )
+
+        return response
+    except Exception as e:
+        # Log exceptions for failed calls
+        # with open("error_log.txt", "a") as error_log_file:
+        #     error_log_file.write(
+        #         f"\nException: {str(e)}\n\n"
+        #     )
+        return None
+
+start_time = time.time()
+# Number of concurrent calls (you can adjust this)
+concurrent_calls = 500
+
+# List to store the futures of concurrent calls
+futures = []
+
+# Make concurrent calls
+with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
+    for _ in range(concurrent_calls):
+        futures.append(executor.submit(make_openai_embedding, question))
+
+# Wait for all futures to complete
+concurrent.futures.wait(futures)
+
+# Summarize the results
+successful_calls = 0
+failed_calls = 0
+
+for future in futures:
+    if future.result() is not None:
+        successful_calls += 1
+    else:
+        failed_calls += 1
+end_time = time.time()
+# Calculate the duration
+duration = end_time - start_time
+
+
+print("Load test summary:")
+print(f"Total Requests: {concurrent_calls}")
+print(f"Successful Calls: {successful_calls}")
+print(f"Failed Calls: {failed_calls}")
+print(f"Total Time: {duration:.2f} seconds")
+
+# # Display content of the logs
+# with open("request_log.txt", "r") as log_file:
+#     print("\nRequest Log:\n", log_file.read())
+
+# with open("error_log.txt", "r") as error_log_file:
+#     print("\nError Log:\n", error_log_file.read())
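
The throughput and latency numbers added to the docs above compare the LiteLLM proxy against calling the OpenAI API directly. As a point of reference, a raw-OpenAI baseline for the load test in this diff might look like the sketch below; this is an assumption about how the comparison could be reproduced, not the exact script behind the published figures. It drops the proxy `base_url`, sends the same 500 concurrent embedding requests straight to OpenAI, and records per-request latency so averages can be compared against the proxy run.

```python
# Hypothetical raw-OpenAI baseline for the proxy load test above (assumption,
# not necessarily the script used to produce the numbers in the docs).
import concurrent.futures
import os
import time

import openai

question = "embed this very long text" * 100
concurrent_calls = 500

# No base_url here, so requests go directly to the OpenAI API.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def make_openai_embedding(text):
    start = time.time()
    try:
        client.embeddings.create(model="text-embedding-ada-002", input=[text])
        return time.time() - start  # per-request latency in seconds
    except Exception:
        return None


start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
    latencies = list(executor.map(make_openai_embedding, [question] * concurrent_calls))
total = time.time() - start

ok = [latency for latency in latencies if latency is not None]
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {len(ok)}")
print(f"Failed Calls: {concurrent_calls - len(ok)}")
print(f"Total Time: {total:.2f} seconds")
if ok:
    print(f"Average Latency: {sum(ok) / len(ok):.5f} seconds")
```

Running this sketch and `load_test_embedding_proxy.py` with the same `concurrent_calls` and input would give a total-time (throughput) and average per-request latency figure for each path, which is the kind of comparison the docs section summarizes.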