From bcc7f2c7d0a24a4671e98134e6c28f74e6acff26 Mon Sep 17 00:00:00 2001
From: ehhuang
Date: Thu, 4 Sep 2025 11:37:46 -0700
Subject: [PATCH] chore: async inference store write (#3318)

# What does this PR do?
Moves the inference-store write off the request path: instead of awaiting
`store_chat_completion` before returning, the router schedules it as a
background task with `asyncio.create_task`, so the client response no longer
blocks on the store write. Also adds the safety API, an inline llama-guard
provider, and a shield to the k8s benchmark config. In the benchmark below,
throughput improves from 42.23 to 48.30 requests per second.

## Test Plan
```
cd /docs/source/distributions/k8s-benchmark
# start mock server
python openai-mock-server.py --port 8000
# start stack server
uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml
# run benchmark script
uv run python3 benchmark.py --duration 30 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct
```

Before:
============================================================
BENCHMARK RESULTS
============================================================
Total time: 30.00s
Concurrent users: 50
Total requests: 1267
Successful requests: 1267
Failed requests: 0
Success rate: 100.0%
Requests per second: 42.23

After:
============================================================
BENCHMARK RESULTS
============================================================
Total time: 30.00s
Concurrent users: 50
Total requests: 1449
Successful requests: 1449
Failed requests: 0
Success rate: 100.0%
Requests per second: 48.30
---
 .../distributions/k8s-benchmark/stack_run_config.yaml | 8 ++++++++
 llama_stack/core/routers/inference.py                 | 4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
index ceb1ba2d9..5a810639e 100644
--- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -30,6 +31,11 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -95,6 +101,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []

diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index 8dcad85e3..045093fe0 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -527,7 +527,7 @@ class InferenceRouter(Inference):
 
         # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
 
         if self.telemetry:
             metrics = self._construct_metrics(
@@ -855,4 +855,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
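
For reviewers, here is a minimal, self-contained sketch of the fire-and-forget pattern this patch adopts. `FakeStore` and `handle_request` are hypothetical stand-ins for illustration, not the actual `InferenceRouter` or inference-store API. One caveat worth noting: asyncio keeps only weak references to scheduled tasks, so a task created without a retained reference can be garbage-collected before it runs; the sketch holds references in a set, which the patch as written does not.

```python
# Sketch of the fire-and-forget write pattern (illustrative names only).
import asyncio


class FakeStore:
    """Stand-in for the inference store; sleeps to simulate write latency."""

    def __init__(self) -> None:
        self.rows: list[str] = []

    async def store_chat_completion(self, response: str) -> None:
        await asyncio.sleep(0.05)  # simulated database write
        self.rows.append(response)


async def handle_request(store: FakeStore, background: set[asyncio.Task]) -> str:
    response = "chat-completion"
    # Before: `await store.store_chat_completion(response)` holds the
    # request open for the full write latency.
    # After: schedule the write and return to the client immediately.
    task = asyncio.create_task(store.store_chat_completion(response))
    # asyncio holds only weak references to tasks, so keep a strong
    # reference until the write completes.
    background.add(task)
    task.add_done_callback(background.discard)
    return response


async def main() -> None:
    store = FakeStore()
    background: set[asyncio.Task] = set()
    results = await asyncio.gather(
        *(handle_request(store, background) for _ in range(50))
    )
    # Drain pending writes before shutdown so nothing is lost.
    await asyncio.gather(*background)
    print(len(results), "responses,", len(store.rows), "rows stored")


if __name__ == "__main__":
    asyncio.run(main())
```

Under these assumptions, each awaited write adds its full latency to the response time, while the scheduled version returns immediately and lets the writes complete in the background, which is consistent with the throughput gain the benchmark reports.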