Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-04 04:04:14 +00:00
chore: introduce write queue for inference_store (#3383)
# What does this PR do?

Adds a write worker queue for writes to the inference store. This keeps slow inference-store writes from overwhelming request processing.

## Test Plan

Benchmark:

```
cd docs/source/distributions/k8s-benchmark

# start mock server
python openai-mock-server.py --port 8000

# start stack server
LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml

# run benchmark script
uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct
```

## Benchmark result

RPS improved from 21 to 57.
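The queue itself lives inside `InferenceStore` and is not visible in the hunks below, which only cover the router wiring. As a minimal sketch of the write-worker-queue pattern the description refers to (the class shape, the `write`/`_persist` names, and the tuning parameters are all hypothetical; only `initialize()` and `shutdown()` appear in the diff):

```python
# Illustrative sketch only; not the actual InferenceStore implementation.
import asyncio
from typing import Any


class WriteQueueStore:
    def __init__(self, num_writers: int = 4, max_pending: int = 1000) -> None:
        self._num_writers = num_writers
        self._max_pending = max_pending
        self._queue: asyncio.Queue | None = None
        self._workers: list[asyncio.Task] = []

    async def initialize(self) -> None:
        # Worker tasks must be spawned from inside a running event loop,
        # which is why the router awaits initialize() after construction.
        self._queue = asyncio.Queue(maxsize=self._max_pending)
        self._workers = [
            asyncio.create_task(self._worker()) for _ in range(self._num_writers)
        ]

    async def write(self, record: Any) -> None:
        # Request path: enqueue and return immediately instead of awaiting
        # the database write. The bounded queue applies backpressure if the
        # writers fall too far behind.
        assert self._queue is not None
        await self._queue.put(record)

    async def _worker(self) -> None:
        assert self._queue is not None
        while True:
            record = await self._queue.get()
            try:
                await self._persist(record)  # the slow write, now off the hot path
            except Exception as e:
                print(f"write failed: {e}")  # a real store would log this
            finally:
                self._queue.task_done()

    async def _persist(self, record: Any) -> None:
        ...  # hypothetical: the actual SQL/kvstore write goes here

    async def shutdown(self) -> None:
        # Drain whatever is still queued, then stop the workers, so buffered
        # records are not lost on server exit.
        if self._queue is not None:
            await self._queue.join()
        for task in self._workers:
            task.cancel()
```

Decoupling the request path from persistence this way is what the RPS numbers above are measuring: handlers spend their time on inference instead of waiting for store writes to commit.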
Parent: e6edc1f934
Commit: e980436a2e

7 changed files with 139 additions and 22 deletions
```diff
@@ -78,7 +78,10 @@ async def get_auto_router_impl(
     # TODO: move pass configs to routers instead
     if api == Api.inference and run_config.inference_store:
-        inference_store = InferenceStore(run_config.inference_store, policy)
+        inference_store = InferenceStore(
+            config=run_config.inference_store,
+            policy=policy,
+        )
         await inference_store.initialize()
         api_to_dep_impl["store"] = inference_store
 
 
```
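Note on the hunk above: the `InferenceStore` constructor is switched from positional to keyword arguments. Presumably this makes room for queue-related configuration added elsewhere in the PR; the new parameters themselves are not visible in this excerpt.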
```diff
@@ -90,6 +90,11 @@ class InferenceRouter(Inference):
 
     async def shutdown(self) -> None:
         logger.debug("InferenceRouter.shutdown")
+        if self.store:
+            try:
+                await self.store.shutdown()
+            except Exception as e:
+                logger.warning(f"Error during InferenceStore shutdown: {e}")
 
     async def register_model(
         self,
```
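Two notes on the hunk above. First, the guarded `store.shutdown()` is what gives buffered writes a chance to flush on exit; in the sketch earlier this corresponds to draining the queue before cancelling the workers (inferred from the diff, since the drain logic itself lives in `InferenceStore`). Second, to put the 21 -> 57 RPS figure in context, the benchmark in the test plan boils down to a loop like the following hypothetical stand-in for `benchmark.py` (endpoint and model are taken from the test plan command; the real script may differ):

```python
# Hypothetical stand-in for the benchmark script; not its actual source.
import asyncio
import time

from openai import AsyncOpenAI

BASE_URL = "http://localhost:8321/v1/openai/v1"  # from the test plan
MODEL = "vllm-inference/meta-llama/Llama-3.2-3B-Instruct"


async def worker(client: AsyncOpenAI, stop_at: float, counts: list[int]) -> None:
    # Issue requests back to back until the deadline passes.
    while time.monotonic() < stop_at:
        await client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": "ping"}],
        )
        counts[0] += 1


async def main(duration: float = 120.0, concurrent: int = 50) -> None:
    client = AsyncOpenAI(base_url=BASE_URL, api_key="none")  # mock server ignores the key
    stop_at = time.monotonic() + duration
    counts = [0]
    # 50 concurrent workers, mirroring --concurrent 50 in the test plan.
    await asyncio.gather(*(worker(client, stop_at, counts) for _ in range(concurrent)))
    print(f"RPS: {counts[0] / duration:.1f}")


if __name__ == "__main__":
    asyncio.run(main())
```

With the write queue in place, each request handler finishes as soon as the completion is generated and enqueued, so the same 50 workers complete roughly 2.7x more requests in the same 120 seconds.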