(docs) proxy performance

This commit is contained in:
parent ad1672b901
commit 111c7afaca

4 changed files with 113 additions and 0 deletions
@@ -1077,6 +1077,18 @@ Expected output on Langfuse
<Image img={require('../img/langfuse_small.png')} />

## LiteLLM Proxy Performance

### Throughput - 30% Increase

LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API.

<Image img={require('../img/throughput.png')} />
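The "Load Balancer" in this comparison refers to litellm's Router, which spreads requests for one model group across multiple deployments. A minimal sketch of such a setup; the deployment names and environment variables below are illustrative placeholders, not taken from this commit:

```python
# Minimal litellm Router sketch: two deployments behind one model group,
# so concurrent requests are spread across them. Names/env vars are placeholders.
import os
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # the alias callers send requests to
        "litellm_params": {
            "model": "azure/chatgpt-v-2",  # hypothetical Azure deployment
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",  # plain OpenAI deployment
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
]

router = Router(model_list=model_list)
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hey"}],
)
```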
### Latency Added - 0.00325 seconds

LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API.

<Image img={require('../img/latency.png')} />
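One way to reproduce an added-latency figure like this is to time the same embedding call against the proxy and against OpenAI directly, then compare the means. A rough sketch, assuming a proxy running locally on port 8000 and `OPENAI_API_KEY` in the environment:

```python
# Rough comparison of per-request latency: proxied vs. direct.
# Assumes a LiteLLM proxy on http://0.0.0.0:8000 and OPENAI_API_KEY in the env.
import os
import time
import openai

def mean_latency(client, n=20):
    timings = []
    for _ in range(n):
        start = time.time()
        client.embeddings.create(model="text-embedding-ada-002", input=["ping"])
        timings.append(time.time() - start)
    return sum(timings) / len(timings)

direct = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
proxied = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000")

print(f"added latency: {mean_latency(proxied) - mean_latency(direct):.5f} seconds")
```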
## Proxy CLI Arguments
BIN  docs/my-website/img/latency.png (new file, binary not shown; 66 KiB)
BIN  docs/my-website/img/throughput.png (new file, binary not shown; 71 KiB)
litellm/proxy/tests/load_test_embedding_proxy.py (new file, 101 lines)
@@ -0,0 +1,101 @@
# Time how long it takes to make N concurrent embedding requests through the
# LiteLLM proxy (an OpenAI-compatible endpoint running on http://0.0.0.0:8000).
import concurrent.futures
import os
import sys
import time

from dotenv import load_dotenv

load_dotenv()

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import litellm
import openai

litellm.set_verbose = False

question = "embed this very long text" * 100


# Make one embedding call through the proxy. Returns the response on success,
# or None on failure so the caller can tally successful vs. failed calls.
def make_openai_embedding(question):
    try:
        start_time = time.time()
        client = openai.OpenAI(
            api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000"
        )
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=[question],
        )
        print(response)
        end_time = time.time()

        # Log the request details
        # with open("request_log.txt", "a") as log_file:
        #     log_file.write(
        #         f"Question: {question[:100]}\nEmbedding dims: {len(response.data[0].embedding)}\nTime: {end_time - start_time:.2f} seconds\n\n"
        #     )

        return response
    except Exception as e:
        # Log exceptions for failed calls
        # with open("error_log.txt", "a") as error_log_file:
        #     error_log_file.write(f"\nException: {str(e)}\n\n")
        return None


start_time = time.time()

# Number of concurrent calls (you can adjust this)
concurrent_calls = 500

# List to store the futures of concurrent calls
futures = []

# Make concurrent calls
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
    for _ in range(concurrent_calls):
        futures.append(executor.submit(make_openai_embedding, question))

# Wait for all futures to complete
concurrent.futures.wait(futures)

# Summarize the results
successful_calls = 0
failed_calls = 0

for future in futures:
    if future.result() is not None:
        successful_calls += 1
    else:
        failed_calls += 1

end_time = time.time()
duration = end_time - start_time

print("Load Test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")
print(f"Total Time: {duration:.2f} seconds")

# Display the contents of the logs
# with open("request_log.txt", "r") as log_file:
#     print("\nRequest Log:\n", log_file.read())

# with open("error_log.txt", "r") as error_log_file:
#     print("\nError Log:\n", error_log_file.read())