From 826f56a6a0b2a77d8100edd980adb534531a85b9 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 21 Nov 2023 19:32:41 -0800
Subject: [PATCH] docs(routing.md): update routing docs

---
 docs/my-website/docs/routing.md | 14 +++++++++++---
 litellm/proxy/proxy_cli.py      | 10 +++-------
 litellm/proxy/proxy_server.py   |  2 +-
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 86c8fa302..87cefe580 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -247,16 +247,24 @@ print(f"response: {response}")
 
 If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)
 
-## Queuing
-
-### Quick Start
+## Queuing (Beta)
 
 This requires a [Redis DB](https://redis.com/) to work.
 
 Our implementation uses LiteLLM's proxy server + Celery workers to process up to 100 req./s
 
+:::info
+
+This is pretty new and might have bugs. Any contributions to improving our implementation are welcome.
+
+:::
+
+
 [**See Code**](https://github.com/BerriAI/litellm/blob/fbf9cab5b9e35df524e2c9953180c58d92e4cd97/litellm/proxy/proxy_server.py#L589)
 
+
+### Quick Start
+
 1. Add Redis credentials in a .env file
 
 ```python

diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py
index 629705749..6a649f5bf 100644
--- a/litellm/proxy/proxy_cli.py
+++ b/litellm/proxy/proxy_cli.py
@@ -139,11 +139,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
                         status = polling_response["status"]
                         if status == "finished":
                             llm_response = polling_response["result"]
-                            with open("response_log.txt", "a") as log_file:
-                                log_file.write(
-                                    f"Response ID: {llm_response.get('id', 'NA')}\nLLM Response: {llm_response}\nTime: {end_time - start_time:.2f} seconds\n\n"
-                                )
-
                             break
                         print(f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}")
                         time.sleep(0.5)
@@ -156,7 +151,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
 
             # List to store the futures of concurrent calls
             futures = []
-
+            start_time = time.time()
             # Make concurrent calls
             with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
                 for _ in range(concurrent_calls):
@@ -175,7 +170,8 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
                     successful_calls += 1
                 else:
                     failed_calls += 1
-
+            end_time = time.time()
+            print(f"Elapsed Time: {end_time-start_time}")
             print(f"Load test Summary:")
             print(f"Total Requests: {concurrent_calls}")
             print(f"Successful Calls: {successful_calls}")
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 4485e75ce..e93527eae 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -618,7 +618,7 @@ async def async_queue_response(request: Request, task_id: str):
     try:
         job = async_result(task_id, app=celery_app_conn)
         if job.ready():
-            return job.result
+            return {"status": "finished", "result": job.result}
         else:
             return {'status': 'queued'}
     except Exception as e:
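
For context on what this patch supports end to end, below is a minimal sketch of the client-side queuing flow: submit a request to the proxy's queue, then poll until the job reports `{"status": "finished", "result": ...}`, which is the response shape introduced in the `proxy_server.py` change above. The proxy address, the `/queue/request` path, and the shape of the submit response (`id`, polling `url`) are assumptions for illustration only; they are not defined in this patch.

```python
# Minimal sketch of polling a queued job on the LiteLLM proxy (queuing is beta).
# Assumptions (not from this patch): the proxy runs on http://0.0.0.0:8000,
# jobs are submitted via POST /queue/request, and the submit response includes
# a polling "url". Only the finished-job payload shape comes from this patch.
import time
import requests

PROXY_BASE = "http://0.0.0.0:8000"  # assumed local proxy address


def queue_and_poll(payload: dict, timeout: float = 30.0, interval: float = 0.5):
    # Submit the request to the queue endpoint (path assumed for illustration)
    submit = requests.post(f"{PROXY_BASE}/queue/request", json=payload)
    submit.raise_for_status()
    polling_url = f"{PROXY_BASE}{submit.json()['url']}"  # assumed response shape

    start = time.time()
    while time.time() - start < timeout:
        polling_response = requests.get(polling_url).json()
        if polling_response.get("status") == "finished":
            # Matches the payload returned by async_queue_response after this patch:
            # {"status": "finished", "result": job.result}
            return polling_response["result"]
        time.sleep(interval)
    raise TimeoutError(f"Queued job did not finish within {timeout}s")


if __name__ == "__main__":
    result = queue_and_poll({
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    })
    print(result)
```

The same poll loop is what the load-test code in `proxy_cli.py` runs concurrently; the patch simply drops the per-response file logging and adds elapsed-time reporting around the thread pool.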