diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index ceb1ba2d9..5a810639e 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo apis: - agents - inference +- safety - telemetry - tool_runtime - vector_io @@ -30,6 +31,11 @@ providers: db: ${env.POSTGRES_DB:=llamastack} user: ${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -95,6 +101,8 @@ models: - model_id: ${env.INFERENCE_MODEL} provider_id: vllm-inference model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} vector_dbs: [] datasets: [] scoring_fns: [] diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 8dcad85e3..045093fe0 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -527,7 +527,7 @@ class InferenceRouter(Inference): # Store the response with the ID that will be returned to the client if self.store: - await self.store.store_chat_completion(response, messages) + asyncio.create_task(self.store.store_chat_completion(response, messages)) if self.telemetry: metrics = self._construct_metrics( @@ -855,4 +855,4 @@ class InferenceRouter(Inference): object="chat.completion", ) logger.debug(f"InferenceRouter.completion_response: {final_response}") - await self.store.store_chat_completion(final_response, messages) + asyncio.create_task(self.store.store_chat_completion(final_response, messages))