chore: introduce write queue for inference_store (#3383)

# What does this PR do?

Adds a write worker queue for writes to the inference store. This avoids overwhelming request processing with slow inference writes.

## Test Plan

Benchmark:

```
cd /docs/source/distributions/k8s-benchmark
# start mock server
python openai-mock-server.py --port 8000
# start stack server
LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml
# run benchmark script
uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct
```

## RPS from 21 -> 57
2025-10-04 20:14:13 +00:00 · 2025-09-10 11:57:42 -07:00 · 2025-09-10 11:57:42 -07:00 · e980436a2e
commit e980436a2e
parent e6edc1f934
7 changed files with 139 additions and 22 deletions
--- a/docs/source/distributions/k8s-benchmark/benchmark.py
+++ b/docs/source/distributions/k8s-benchmark/benchmark.py
@ -58,14 +58,6 @@ class BenchmarkStats:
        
        print(f"\n{'='*60}")
        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
        
        print(f"\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
@ -106,6 +98,15 @@ class BenchmarkStats:
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")
        
+        print(f"{'='*60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
+        
        if self.errors:
            print(f"\nErrors (showing first 5):")
            for error in self.errors[:5]:
@ -215,7 +216,7 @@ class LlamaStackBenchmark:
                        await asyncio.sleep(1)  # Report every second
                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
                            elapsed = time.time() - stats.start_time
-                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break