chore: async inference store write (#3318)

# What does this PR do?

Kick off the inference-store write for chat completions as a background task (`asyncio.create_task`) instead of awaiting it on the request path, so persisting responses no longer blocks inference requests. In the benchmark below this raises throughput from 42.23 to 48.30 requests per second. The k8s-benchmark run config is also updated to enable the safety API with a Llama Guard provider and a default shield.

## Test Plan
```
cd /docs/source/distributions/k8s-benchmark
# start mock server
python openai-mock-server.py --port 8000
# start stack server
uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml
# run benchmark script
uv run python3 benchmark.py --duration 30 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct
```
Before:

```
============================================================
BENCHMARK RESULTS
============================================================
Total time: 30.00s
Concurrent users: 50
Total requests: 1267
Successful requests: 1267
Failed requests: 0
Success rate: 100.0%
Requests per second: 42.23
```


After:

```
============================================================
BENCHMARK RESULTS
============================================================
Total time: 30.00s
Concurrent users: 50
Total requests: 1449
Successful requests: 1449
Failed requests: 0
Success rate: 100.0%
Requests per second: 48.30
```
2 changed files with 10 additions and 2 deletions

`docs/source/distributions/k8s-benchmark/stack_run_config.yaml`:

```diff
@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -30,6 +31,11 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -95,6 +101,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
```
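
Beyond the async store write, the run config above also turns on the safety API with a Llama Guard provider and registers a default shield. As a rough usage sketch (not part of this PR, and assuming the `llama-stack-client` `safety.run_shield` call; parameter shapes may differ between client versions), the registered shield could be exercised like this:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Shield ID mirrors the config default: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
result = client.safety.run_shield(
    shield_id="meta-llama/Llama-Guard-3-1B",
    messages=[{"role": "user", "content": "Hello, can you help me plan a trip?"}],
    params={},
)
if result.violation:
    print(result.violation.user_message)
```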

`InferenceRouter`:

```diff
@@ -527,7 +527,7 @@ class InferenceRouter(Inference):
         # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
         if self.telemetry:
             metrics = self._construct_metrics(
@@ -855,4 +855,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
```
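
For reference, the change swaps a blocking `await` for a fire-and-forget task: `asyncio.create_task` schedules the store write on the running event loop and returns immediately, so the response can be sent without waiting for the write to finish. Below is a minimal, illustrative sketch of the same pattern (names such as `schedule_store_write` and `_background_tasks` are not part of this PR); it also keeps a reference to the task so it is not garbage-collected early and logs any failure, since nothing awaits the result:

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

# Hold references to in-flight tasks so the event loop does not drop them early.
_background_tasks: set[asyncio.Task] = set()


def _on_done(task: asyncio.Task) -> None:
    _background_tasks.discard(task)
    if not task.cancelled() and task.exception() is not None:
        logger.error("background store write failed", exc_info=task.exception())


def schedule_store_write(store, response, messages) -> None:
    """Schedule store.store_chat_completion without blocking the request path."""
    task = asyncio.create_task(store.store_chat_completion(response, messages))
    _background_tasks.add(task)
    task.add_done_callback(_on_done)
```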