(docs) proxy performance

This commit is contained in:
parent ad1672b901
commit 111c7afaca

4 changed files with 113 additions and 0 deletions
@@ -1077,6 +1077,18 @@ Expected output on Langfuse
<Image img={require('../img/langfuse_small.png')} />

## LiteLLM Proxy Performance

### Throughput - 30% Increase

LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API.

<Image img={require('../img/throughput.png')} />
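The "Load Balancer" in this comparison refers to litellm's Router, which spreads requests for one model group across multiple deployments. A minimal sketch of such a setup; the deployment names and environment variables below are illustrative placeholders, not taken from this commit:

```python
# Minimal litellm Router sketch: two deployments behind one model group,
# so concurrent requests are spread across them. Names/env vars are placeholders.
import os
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # the alias callers send requests to
        "litellm_params": {
            "model": "azure/chatgpt-v-2",  # hypothetical Azure deployment
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",  # plain OpenAI deployment
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
]

router = Router(model_list=model_list)
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hey"}],
)
```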
### Latency Added - 0.00325 seconds

LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API.

<Image img={require('../img/latency.png')} />
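One way to reproduce an added-latency figure like this is to time the same embedding call against the proxy and against OpenAI directly, then compare the means. A rough sketch, assuming a proxy running locally on port 8000 and `OPENAI_API_KEY` in the environment:

```python
# Rough comparison of per-request latency: proxied vs. direct.
# Assumes a LiteLLM proxy on http://0.0.0.0:8000 and OPENAI_API_KEY in the env.
import os
import time
import openai

def mean_latency(client, n=20):
    timings = []
    for _ in range(n):
        start = time.time()
        client.embeddings.create(model="text-embedding-ada-002", input=["ping"])
        timings.append(time.time() - start)
    return sum(timings) / len(timings)

direct = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
proxied = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000")

print(f"added latency: {mean_latency(proxied) - mean_latency(direct):.5f} seconds")
```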
## Proxy CLI Arguments
BIN  docs/my-website/img/latency.png (new file, binary not shown; 66 KiB)
BIN  docs/my-website/img/throughput.png (new file, binary not shown; 71 KiB)
litellm/proxy/tests/load_test_embedding_proxy.py (new file, 101 lines)
@@ -0,0 +1,101 @@
# Time how long it takes to make N concurrent embedding requests through the
# LiteLLM proxy (an OpenAI-compatible endpoint running on http://0.0.0.0:8000).
import concurrent.futures
import os
import sys
import time

from dotenv import load_dotenv

load_dotenv()

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import litellm
import openai

litellm.set_verbose = False

question = "embed this very long text" * 100


# Make one embedding call through the proxy. Returns the response on success,
# or None on failure so the caller can tally successful vs. failed calls.
def make_openai_embedding(question):
    try:
        start_time = time.time()
        client = openai.OpenAI(
            api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000"
        )
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=[question],
        )
        print(response)
        end_time = time.time()

        # Log the request details
        # with open("request_log.txt", "a") as log_file:
        #     log_file.write(
        #         f"Question: {question[:100]}\nEmbedding dims: {len(response.data[0].embedding)}\nTime: {end_time - start_time:.2f} seconds\n\n"
        #     )

        return response
    except Exception as e:
        # Log exceptions for failed calls
        # with open("error_log.txt", "a") as error_log_file:
        #     error_log_file.write(f"\nException: {str(e)}\n\n")
        return None


start_time = time.time()

# Number of concurrent calls (you can adjust this)
concurrent_calls = 500

# List to store the futures of concurrent calls
futures = []

# Make concurrent calls
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
    for _ in range(concurrent_calls):
        futures.append(executor.submit(make_openai_embedding, question))

# Wait for all futures to complete
concurrent.futures.wait(futures)

# Summarize the results
successful_calls = 0
failed_calls = 0

for future in futures:
    if future.result() is not None:
        successful_calls += 1
    else:
        failed_calls += 1

end_time = time.time()
duration = end_time - start_time

print("Load Test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")
print(f"Total Time: {duration:.2f} seconds")

# Display the contents of the logs
# with open("request_log.txt", "r") as log_file:
#     print("\nRequest Log:\n", log_file.read())

# with open("error_log.txt", "r") as error_log_file:
#     print("\nError Log:\n", error_log_file.read())