forked from phoenix/litellm-mirror
refactor: add black formatting
parent b87d630b0a
commit 4905929de3
156 changed files with 19723 additions and 10869 deletions
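The changes below are mechanical: Black normalizes string quotes to double quotes and re-wraps lines longer than its default 88-character limit. A minimal before/after sketch of the pattern, using lines drawn from the benchmark script in this diff (illustrative only, not an additional change):

# before black
models = ['gpt-3.5-turbo', 'claude-2']
colored_description = colored(f"Running question: {question} for model: {model}", 'green')

# after running `black .`
models = ["gpt-3.5-turbo", "claude-2"]
colored_description = colored(
    f"Running question: {question} for model: {model}", "green"
)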
@@ -9,33 +9,37 @@ import os

# Define the list of models to benchmark
# select any LLM listed here: https://docs.litellm.ai/docs/providers
models = ['gpt-3.5-turbo', 'claude-2']
models = ["gpt-3.5-turbo", "claude-2"]

# Enter LLM API keys
# https://docs.litellm.ai/docs/providers
os.environ['OPENAI_API_KEY'] = ""
os.environ['ANTHROPIC_API_KEY'] = ""
os.environ["OPENAI_API_KEY"] = ""
os.environ["ANTHROPIC_API_KEY"] = ""

# List of questions to benchmark (replace with your questions)
questions = [
"When will BerriAI IPO?",
"When will LiteLLM hit $100M ARR?"
]
questions = ["When will BerriAI IPO?", "When will LiteLLM hit $100M ARR?"]

# Enter your system prompt here
# Enter your system prompt here
system_prompt = """
You are LiteLLMs helpful assistant
"""


@click.command()
@click.option('--system-prompt', default="You are a helpful assistant that can answer questions.", help="System prompt for the conversation.")
@click.option(
"--system-prompt",
default="You are a helpful assistant that can answer questions.",
help="System prompt for the conversation.",
)
def main(system_prompt):
for question in questions:
data = [] # Data for the current question

with tqdm(total=len(models)) as pbar:
for model in models:
colored_description = colored(f"Running question: {question} for model: {model}", 'green')
colored_description = colored(
f"Running question: {question} for model: {model}", "green"
)
pbar.set_description(colored_description)
start_time = time.time()
@@ -44,35 +48,43 @@ def main(system_prompt):
max_tokens=500,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": question}
{"role": "user", "content": question},
],
)

end = time.time()
total_time = end - start_time
cost = completion_cost(completion_response=response)
raw_response = response['choices'][0]['message']['content']
raw_response = response["choices"][0]["message"]["content"]

data.append({
'Model': colored(model, 'light_blue'),
'Response': raw_response, # Colorize the response
'ResponseTime': colored(f"{total_time:.2f} seconds", "red"),
'Cost': colored(f"${cost:.6f}", 'green'), # Colorize the cost
})
data.append(
{
"Model": colored(model, "light_blue"),
"Response": raw_response, # Colorize the response
"ResponseTime": colored(f"{total_time:.2f} seconds", "red"),
"Cost": colored(f"${cost:.6f}", "green"), # Colorize the cost
}
)

pbar.update(1)

# Separate headers from the data
headers = ['Model', 'Response', 'Response Time (seconds)', 'Cost ($)']
headers = ["Model", "Response", "Response Time (seconds)", "Cost ($)"]
colwidths = [15, 80, 15, 10]

# Create a nicely formatted table for the current question
table = tabulate([list(d.values()) for d in data], headers, tablefmt="grid", maxcolwidths=colwidths)

table = tabulate(
[list(d.values()) for d in data],
headers,
tablefmt="grid",
maxcolwidths=colwidths,
)

# Print the table for the current question
colored_question = colored(question, 'green')
colored_question = colored(question, "green")
click.echo(f"\nBenchmark Results for '{colored_question}':")
click.echo(table) # Display the formatted table

if __name__ == '__main__':

if __name__ == "__main__":
main()
@@ -1,25 +1,22 @@
import sys, os
import traceback
from dotenv import load_dotenv

load_dotenv()

import litellm
from litellm import embedding, completion, completion_cost

from autoevals.llm import *

###################
import litellm

# litellm completion call
question = "which country has the highest population"
response = litellm.completion(
model = "gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": question
}
],
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": question}],
)
print(response)
# use the auto eval Factuality() evaluator
@@ -27,9 +24,11 @@ print(response)
print("calling evaluator")
evaluator = Factuality()
result = evaluator(
output=response.choices[0]["message"]["content"], # response from litellm.completion()
expected="India", # expected output
input=question # question passed to litellm.completion
output=response.choices[0]["message"][
"content"
], # response from litellm.completion()
expected="India", # expected output
input=question, # question passed to litellm.completion
)

print(result)
print(result)
@@ -4,9 +4,10 @@ from flask_cors import CORS
import traceback
import litellm
from util import handle_error
from litellm import completion
import os, dotenv, time
from litellm import completion
import os, dotenv, time
import json

dotenv.load_dotenv()

# TODO: set your keys in .env or here:
@@ -19,57 +20,72 @@ verbose = True

# litellm.caching_with_models = True # CACHING: caching_with_models Keys in the cache are messages + model. - to learn more: https://docs.litellm.ai/docs/caching/
######### PROMPT LOGGING ##########
os.environ["PROMPTLAYER_API_KEY"] = "" # set your promptlayer key here - https://promptlayer.com/
os.environ[
"PROMPTLAYER_API_KEY"
] = "" # set your promptlayer key here - https://promptlayer.com/

# set callbacks
litellm.success_callback = ["promptlayer"]
############ HELPER FUNCTIONS ###################################


def print_verbose(print_statement):
if verbose:
print(print_statement)


app = Flask(__name__)
CORS(app)

@app.route('/')

@app.route("/")
def index():
return 'received!', 200
return "received!", 200


def data_generator(response):
for chunk in response:
yield f"data: {json.dumps(chunk)}\n\n"

@app.route('/chat/completions', methods=["POST"])

@app.route("/chat/completions", methods=["POST"])
def api_completion():
data = request.json
start_time = time.time()
if data.get('stream') == "True":
data['stream'] = True # convert to boolean
start_time = time.time()
if data.get("stream") == "True":
data["stream"] = True # convert to boolean
try:
if "prompt" not in data:
raise ValueError("data needs to have prompt")
data["model"] = "togethercomputer/CodeLlama-34b-Instruct" # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
data[
"model"
] = "togethercomputer/CodeLlama-34b-Instruct" # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
# COMPLETION CALL
system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": data.pop("prompt")}]
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": data.pop("prompt")},
]
data["messages"] = messages
print(f"data: {data}")
response = completion(**data)
## LOG SUCCESS
end_time = time.time()
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return Response(data_generator(response), mimetype='text/event-stream')
end_time = time.time()
if (
"stream" in data and data["stream"] == True
): # use generate_responses to stream responses
return Response(data_generator(response), mimetype="text/event-stream")
except Exception as e:
# call handle_error function
print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
## LOG FAILURE
end_time = time.time()
end_time = time.time()
traceback_exception = traceback.format_exc()
return handle_error(data=data)
return response

@app.route('/get_models', methods=["POST"])

@app.route("/get_models", methods=["POST"])
def get_models():
try:
return litellm.model_list
@@ -78,7 +94,8 @@ def get_models():
response = {"error": str(e)}
return response, 200

if __name__ == "__main__":
from waitress import serve
serve(app, host="0.0.0.0", port=4000, threads=500)

if __name__ == "__main__":
from waitress import serve

serve(app, host="0.0.0.0", port=4000, threads=500)
@@ -3,27 +3,28 @@ from urllib.parse import urlparse, parse_qs


def get_next_url(response):
"""
"""
Function to get 'next' url from Link header
:param response: response from requests
:return: next url or None
"""
if 'link' not in response.headers:
return None
headers = response.headers
if "link" not in response.headers:
return None
headers = response.headers

next_url = headers['Link']
print(next_url)
start_index = next_url.find("<")
end_index = next_url.find(">")
next_url = headers["Link"]
print(next_url)
start_index = next_url.find("<")
end_index = next_url.find(">")

return next_url[1:end_index]

return next_url[1:end_index]

def get_models(url):
"""
Function to retrieve all models from paginated endpoint
:param url: base url to make GET request
:return: list of all models
Function to retrieve all models from paginated endpoint
:param url: base url to make GET request
:return: list of all models
"""
models = []
while url:
@@ -36,19 +37,21 @@ def get_models(url):
models.extend(payload)
return models


def get_cleaned_models(models):
"""
Function to clean retrieved models
:param models: list of retrieved models
:return: list of cleaned models
Function to clean retrieved models
:param models: list of retrieved models
:return: list of cleaned models
"""
cleaned_models = []
for model in models:
cleaned_models.append(model["id"])
return cleaned_models


# Get text-generation models
url = 'https://huggingface.co/api/models?filter=text-generation-inference'
url = "https://huggingface.co/api/models?filter=text-generation-inference"
text_generation_models = get_models(url)
cleaned_text_generation_models = get_cleaned_models(text_generation_models)
@@ -56,7 +59,7 @@ print(cleaned_text_generation_models)


# Get conversational models
url = 'https://huggingface.co/api/models?filter=conversational'
url = "https://huggingface.co/api/models?filter=conversational"
conversational_models = get_models(url)
cleaned_conversational_models = get_cleaned_models(conversational_models)
@@ -65,19 +68,23 @@ print(cleaned_conversational_models)

def write_to_txt(cleaned_models, filename):
"""
Function to write the contents of a list to a text file
:param cleaned_models: list of cleaned models
:param filename: name of the text file
Function to write the contents of a list to a text file
:param cleaned_models: list of cleaned models
:param filename: name of the text file
"""
with open(filename, 'w') as f:
with open(filename, "w") as f:
for item in cleaned_models:
f.write("%s\n" % item)


# Write contents of cleaned_text_generation_models to text_generation_models.txt
write_to_txt(cleaned_text_generation_models, 'huggingface_llms_metadata/hf_text_generation_models.txt')
write_to_txt(
cleaned_text_generation_models,
"huggingface_llms_metadata/hf_text_generation_models.txt",
)

# Write contents of cleaned_conversational_models to conversational_models.txt
write_to_txt(cleaned_conversational_models, 'huggingface_llms_metadata/hf_conversational_models.txt')


write_to_txt(
cleaned_conversational_models,
"huggingface_llms_metadata/hf_conversational_models.txt",
)
@@ -1,4 +1,3 @@

import openai

api_base = f"http://0.0.0.0:8000"
@@ -8,29 +7,29 @@ openai.api_key = "temp-key"
print(openai.api_base)


print(f'LiteLLM: response from proxy with streaming')
print(f"LiteLLM: response from proxy with streaming")
response = openai.ChatCompletion.create(
model="ollama/llama2",
messages = [
model="ollama/llama2",
messages=[
{
"role": "user",
"content": "this is a test request, acknowledge that you got it"
"content": "this is a test request, acknowledge that you got it",
}
],
stream=True
stream=True,
)

for chunk in response:
print(f'LiteLLM: streaming response from proxy {chunk}')
print(f"LiteLLM: streaming response from proxy {chunk}")

response = openai.ChatCompletion.create(
model="ollama/llama2",
messages = [
model="ollama/llama2",
messages=[
{
"role": "user",
"content": "this is a test request, acknowledge that you got it"
"content": "this is a test request, acknowledge that you got it",
}
]
],
)

print(f'LiteLLM: response from proxy {response}')
print(f"LiteLLM: response from proxy {response}")
@@ -12,42 +12,51 @@ import pytest

from litellm import Router
import litellm
litellm.set_verbose=False

litellm.set_verbose = False
os.environ.pop("AZURE_AD_TOKEN")

model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
}
}]
model_list = [
{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
]
router = Router(model_list=model_list)


file_paths = ["test_questions/question1.txt", "test_questions/question2.txt", "test_questions/question3.txt"]
file_paths = [
"test_questions/question1.txt",
"test_questions/question2.txt",
"test_questions/question3.txt",
]
questions = []

for file_path in file_paths:
try:
print(file_path)
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
content = file.read()
questions.append(content)
except FileNotFoundError as e:
@@ -59,10 +68,9 @@ for file_path in file_paths:
# print(q)



# make X concurrent calls to litellm.completion(model=gpt-35-turbo, messages=[]), pick a random question in questions array.
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions

import concurrent.futures
import random
@@ -74,10 +82,18 @@ def make_openai_completion(question):
try:
start_time = time.time()
import openai
client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'], base_url="http://0.0.0.0:8000") #base_url="http://0.0.0.0:8000",

client = openai.OpenAI(
api_key=os.environ["OPENAI_API_KEY"], base_url="http://0.0.0.0:8000"
) # base_url="http://0.0.0.0:8000",
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": f"You are a helpful assistant. Answer this question{question}"}],
messages=[
{
"role": "system",
"content": f"You are a helpful assistant. Answer this question{question}",
}
],
)
print(response)
end_time = time.time()
@@ -92,11 +108,10 @@ def make_openai_completion(question):
except Exception as e:
# Log exceptions for failed calls
with open("error_log.txt", "a") as error_log_file:
error_log_file.write(
f"Question: {question[:100]}\nException: {str(e)}\n\n"
)
error_log_file.write(f"Question: {question[:100]}\nException: {str(e)}\n\n")
return None


# Number of concurrent calls (you can adjust this)
concurrent_calls = 100
@@ -133,4 +148,3 @@ with open("request_log.txt", "r") as log_file:

with open("error_log.txt", "r") as error_log_file:
print("\nError Log:\n", error_log_file.read())
@@ -12,42 +12,51 @@ import pytest

from litellm import Router
import litellm
litellm.set_verbose=False

litellm.set_verbose = False
# os.environ.pop("AZURE_AD_TOKEN")

model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
}
}]
model_list = [
{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
]
router = Router(model_list=model_list)


file_paths = ["test_questions/question1.txt", "test_questions/question2.txt", "test_questions/question3.txt"]
file_paths = [
"test_questions/question1.txt",
"test_questions/question2.txt",
"test_questions/question3.txt",
]
questions = []

for file_path in file_paths:
try:
print(file_path)
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
content = file.read()
questions.append(content)
except FileNotFoundError as e:
@@ -59,10 +68,9 @@ for file_path in file_paths:
# print(q)



# make X concurrent calls to litellm.completion(model=gpt-35-turbo, messages=[]), pick a random question in questions array.
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions

import concurrent.futures
import random
@@ -76,9 +84,12 @@ def make_openai_completion(question):
import requests

data = {
'model': 'gpt-3.5-turbo',
'messages': [
{'role': 'system', 'content': f'You are a helpful assistant. Answer this question{question}'},
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": f"You are a helpful assistant. Answer this question{question}",
},
],
}
response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
@@ -89,8 +100,8 @@ def make_openai_completion(question):
log_file.write(
f"Question: {question[:100]}\nResponse ID: {response.get('id', 'N/A')} Url: {response.get('url', 'N/A')}\nTime: {end_time - start_time:.2f} seconds\n\n"
)

# polling the url

# polling the url
while True:
try:
url = response["url"]
@@ -107,7 +118,9 @@ def make_openai_completion(question):
)

break
print(f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}")
print(
f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}"
)
time.sleep(0.5)
except Exception as e:
print("got exception in polling", e)
@@ -117,11 +130,10 @@ def make_openai_completion(question):
except Exception as e:
# Log exceptions for failed calls
with open("error_log.txt", "a") as error_log_file:
error_log_file.write(
f"Question: {question[:100]}\nException: {str(e)}\n\n"
)
error_log_file.write(f"Question: {question[:100]}\nException: {str(e)}\n\n")
return None


# Number of concurrent calls (you can adjust this)
concurrent_calls = 10
@@ -142,7 +154,7 @@ successful_calls = 0
failed_calls = 0

for future in futures:
if future.done():
if future.done():
if future.result() is not None:
successful_calls += 1
else:
@@ -152,4 +164,3 @@ print(f"Load test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")
@@ -12,42 +12,51 @@ import pytest

from litellm import Router
import litellm
litellm.set_verbose=False

litellm.set_verbose = False
os.environ.pop("AZURE_AD_TOKEN")

model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
}
}]
model_list = [
{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
]
router = Router(model_list=model_list)


file_paths = ["test_questions/question1.txt", "test_questions/question2.txt", "test_questions/question3.txt"]
file_paths = [
"test_questions/question1.txt",
"test_questions/question2.txt",
"test_questions/question3.txt",
]
questions = []

for file_path in file_paths:
try:
print(file_path)
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
content = file.read()
questions.append(content)
except FileNotFoundError as e:
@@ -59,10 +68,9 @@ for file_path in file_paths:
# print(q)



# make X concurrent calls to litellm.completion(model=gpt-35-turbo, messages=[]), pick a random question in questions array.
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions
# Allow me to tune X concurrent calls.. Log question, output/exception, response time somewhere
# show me a summary of requests made, success full calls, failed calls. For failed calls show me the exceptions

import concurrent.futures
import random
@@ -75,7 +83,12 @@ def make_openai_completion(question):
start_time = time.time()
response = router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": f"You are a helpful assistant. Answer this question{question}"}],
messages=[
{
"role": "system",
"content": f"You are a helpful assistant. Answer this question{question}",
}
],
)
print(response)
end_time = time.time()
@@ -90,11 +103,10 @@ def make_openai_completion(question):
except Exception as e:
# Log exceptions for failed calls
with open("error_log.txt", "a") as error_log_file:
error_log_file.write(
f"Question: {question[:100]}\nException: {str(e)}\n\n"
)
error_log_file.write(f"Question: {question[:100]}\nException: {str(e)}\n\n")
return None


# Number of concurrent calls (you can adjust this)
concurrent_calls = 150
@@ -131,4 +143,3 @@ with open("request_log.txt", "r") as log_file:

with open("error_log.txt", "r") as error_log_file:
print("\nError Log:\n", error_log_file.read())