docs(routing.md): update routing docs
commit 826f56a6a0 (parent fd3895878d)
3 changed files with 15 additions and 11 deletions
@@ -247,16 +247,24 @@ print(f"response: {response}")
 If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md#multiple-instances-of-1-model)

-## Queuing
+## Queuing (Beta)

-### Quick Start

 This requires a [Redis DB](https://redis.com/) to work.

 Our implementation uses LiteLLM's proxy server + Celery workers to process up to 100 req./s

+:::info
+
+This is pretty new, and might have bugs. Any contributions to improving our implementation are welcome
+
+:::
+
 [**See Code**](https://github.com/BerriAI/litellm/blob/fbf9cab5b9e35df524e2c9953180c58d92e4cd97/litellm/proxy/proxy_server.py#L589)

+### Quick Start
+
 1. Add Redis credentials in a .env file

 ```python
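The ```python block that follows in routing.md is truncated in this view, so here is only an illustrative sketch of what "Add Redis credentials in a .env file" tends to mean for the proxy's Celery workers. The variable names REDIS_HOST / REDIS_PORT / REDIS_PASSWORD and the values are assumptions, not copied from the docs:

```python
# Hypothetical sketch: point the queue's Celery workers at your Redis instance.
# Exact variable names and loading mechanism may differ from the real docs.
import os

os.environ["REDIS_HOST"] = "your-redis-host"          # e.g. a hosted Redis endpoint
os.environ["REDIS_PORT"] = "6379"                     # default Redis port
os.environ["REDIS_PASSWORD"] = "your-redis-password"
```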
@@ -139,11 +139,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
             status = polling_response["status"]
             if status == "finished":
                 llm_response = polling_response["result"]
-                with open("response_log.txt", "a") as log_file:
-                    log_file.write(
-                        f"Response ID: {llm_response.get('id', 'NA')}\nLLM Response: {llm_response}\nTime: {end_time - start_time:.2f} seconds\n\n"
-                    )
-
                 break
             print(f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}")
             time.sleep(0.5)
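For context, the lines above belong to a client-side polling loop in the load-test helper: the request is queued by the proxy, and the client repeatedly fetches a polling URL until the job reports "finished". A minimal sketch of that pattern, under the assumption that the proxy serves the polling URL over HTTP and that the field names match the diff (the function name, use of requests, and timeout handling are placeholders, not copied from the file):

```python
import time
import requests  # assumed HTTP client; the real helper may use a different one

def poll_until_finished(base_url: str, polling_url: str, timeout: float = 60.0):
    """Poll a queued job until it finishes or the timeout expires."""
    start = time.time()
    while time.time() - start < timeout:
        polling_response = requests.get(base_url + polling_url).json()
        status = polling_response["status"]
        if status == "finished":
            return polling_response["result"]
        print(f"POLLING JOB {polling_url}\nSTATUS: {status}")
        time.sleep(0.5)  # same backoff interval as the snippet above
    raise TimeoutError(f"job {polling_url} did not finish in {timeout}s")
```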
@@ -156,7 +151,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers

         # List to store the futures of concurrent calls
         futures = []
-
+        start_time = time.time()
         # Make concurrent calls
         with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
             for _ in range(concurrent_calls):
@@ -175,7 +170,8 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
                     successful_calls += 1
                 else:
                     failed_calls += 1
-
+        end_time = time.time()
+        print(f"Elapsed Time: {end_time-start_time}")
         print(f"Load test Summary:")
         print(f"Total Requests: {concurrent_calls}")
         print(f"Successful Calls: {successful_calls}")
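Taken together, these two hunks shift the timing from per-response logging to the whole batch: start_time is captured once before the ThreadPoolExecutor submits the calls, and the elapsed time plus a pass/fail summary is printed after all futures complete. A compressed, self-contained sketch of that shape (the load_test / make_request names are placeholders; only the counters, prints, and executor usage come from the diff):

```python
import concurrent.futures
import time

def load_test(make_request, concurrent_calls: int = 100):
    """Fire `concurrent_calls` requests in parallel and report timing and outcomes."""
    successful_calls, failed_calls = 0, 0
    futures = []

    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_calls) as executor:
        for _ in range(concurrent_calls):
            futures.append(executor.submit(make_request))
        for future in concurrent.futures.as_completed(futures):
            if future.exception() is None:
                successful_calls += 1
            else:
                failed_calls += 1
    end_time = time.time()

    print(f"Elapsed Time: {end_time-start_time}")
    print(f"Load test Summary:")
    print(f"Total Requests: {concurrent_calls}")
    print(f"Successful Calls: {successful_calls}")
    print(f"Failed Calls: {failed_calls}")
```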
@@ -618,7 +618,7 @@ async def async_queue_response(request: Request, task_id: str):
     try:
         job = async_result(task_id, app=celery_app_conn)
         if job.ready():
-            return job.result
+            return {"status": "finished", "result": job.result}
         else:
             return {'status': 'queued'}
     except Exception as e:
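The proxy-side change makes the polling endpoint's response shape explicit: a finished job now comes back wrapped as {"status": "finished", "result": ...} instead of the raw Celery result, which is exactly what the client polling loop branches on. A hedged sketch of such an endpoint using Celery's standard AsyncResult (the route path, broker URLs, and FastAPI wiring are assumptions; only the handler name, celery_app_conn, and the returned payloads come from the diff):

```python
from celery import Celery
from celery.result import AsyncResult
from fastapi import FastAPI, Request

# Assumption: placeholder broker/backend URLs; the real proxy builds its
# Celery connection from the Redis credentials in the .env file.
celery_app_conn = Celery(broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")
app = FastAPI()

@app.get("/queue/response/{task_id}")  # route path is an assumption, not taken from the diff
async def async_queue_response(request: Request, task_id: str):
    try:
        job = AsyncResult(task_id, app=celery_app_conn)
        if job.ready():
            # wrap the raw Celery result so clients can branch on "status"
            return {"status": "finished", "result": job.result}
        return {"status": "queued"}
    except Exception as e:
        return {"status": "error", "error": str(e)}
```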