forked from phoenix/litellm-mirror
docs(routing.md): add queueing to docs
This commit is contained in:
parent
a2681e353f
commit
9d97082eed
9 changed files with 244 additions and 261 deletions
|
@ -67,6 +67,7 @@ def is_port_in_use(port):
|
|||
@click.option('--headers', default=None, help='headers for the API call')
|
||||
@click.option('--save', is_flag=True, type=bool, help='Save the model-specific config')
|
||||
@click.option('--debug', default=False, is_flag=True, type=bool, help='To debug the input')
|
||||
@click.option('--use_queue', default=False, is_flag=True, type=bool, help='To use celery workers for async endpoints')
|
||||
@click.option('--temperature', default=None, type=float, help='Set temperature for the model')
|
||||
@click.option('--max_tokens', default=None, type=int, help='Set max tokens for the model')
|
||||
@click.option('--request_timeout', default=600, type=int, help='Set timeout in seconds for completion calls')
|
||||
|
@ -78,8 +79,10 @@ def is_port_in_use(port):
|
|||
@click.option('--telemetry', default=True, type=bool, help='Helps us know if people are using this feature. Turn this off by doing `--telemetry False`')
|
||||
@click.option('--logs', flag_value=False, type=int, help='Gets the "n" most recent logs. By default gets most recent log.')
|
||||
@click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
|
||||
@click.option('--test_async', default=False, is_flag=True, help='Calls async endpoints /queue/requests and /queue/response')
|
||||
@click.option('--num_requests', default=10, type=int, help='Number of requests to hit async endpoint with')
|
||||
@click.option('--local', is_flag=True, default=False, help='for local debugging')
|
||||
def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, num_workers):
|
||||
def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, num_workers, test_async, num_requests, use_queue):
|
||||
global feature_telemetry
|
||||
args = locals()
|
||||
if local:
|
||||
|
@ -112,6 +115,72 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
return
|
||||
if model and "ollama" in model:
|
||||
run_ollama_serve()
|
||||
if test_async is True:
|
||||
import requests, concurrent, time
|
||||
api_base = f"http://{host}:{port}"
|
||||
|
||||
def _make_openai_completion():
|
||||
data = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [{"role": "user", "content": "Write a short poem about the moon"}]
|
||||
}
|
||||
|
||||
response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
|
||||
|
||||
response = response.json()
|
||||
|
||||
while True:
|
||||
try:
|
||||
url = response["url"]
|
||||
polling_url = f"{api_base}{url}"
|
||||
polling_response = requests.get(polling_url)
|
||||
polling_response = polling_response.json()
|
||||
print("\n RESPONSE FROM POLLING JOB", polling_response)
|
||||
status = polling_response["status"]
|
||||
if status == "finished":
|
||||
llm_response = polling_response["result"]
|
||||
with open("response_log.txt", "a") as log_file:
|
||||
log_file.write(
|
||||
f"Response ID: {llm_response.get('id', 'NA')}\nLLM Response: {llm_response}\nTime: {end_time - start_time:.2f} seconds\n\n"
|
||||
)
|
||||
|
||||
break
|
||||
print(f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}")
|
||||
time.sleep(0.5)
|
||||
except Exception as e:
|
||||
print("got exception in polling", e)
|
||||
break
|
||||
|
||||
# Number of concurrent calls (you can adjust this)
|
||||
concurrent_calls = num_requests
|
||||
|
||||
# List to store the futures of concurrent calls
|
||||
futures = []
|
||||
|
||||
# Make concurrent calls
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
|
||||
for _ in range(concurrent_calls):
|
||||
futures.append(executor.submit(_make_openai_completion))
|
||||
|
||||
# Wait for all futures to complete
|
||||
concurrent.futures.wait(futures)
|
||||
|
||||
# Summarize the results
|
||||
successful_calls = 0
|
||||
failed_calls = 0
|
||||
|
||||
for future in futures:
|
||||
if future.done():
|
||||
if future.result() is not None:
|
||||
successful_calls += 1
|
||||
else:
|
||||
failed_calls += 1
|
||||
|
||||
print(f"Load test Summary:")
|
||||
print(f"Total Requests: {concurrent_calls}")
|
||||
print(f"Successful Calls: {successful_calls}")
|
||||
print(f"Failed Calls: {failed_calls}")
|
||||
return
|
||||
if test != False:
|
||||
click.echo('\nLiteLLM: Making a test ChatCompletions request to your proxy')
|
||||
import openai
|
||||
|
@ -152,7 +221,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
else:
|
||||
if headers:
|
||||
headers = json.loads(headers)
|
||||
save_worker_config(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config)
|
||||
save_worker_config(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config, use_queue=use_queue)
|
||||
try:
|
||||
import uvicorn
|
||||
except:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue