From 554d9589312dfb5a371d569d362f6dc08688dd1a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 3 Nov 2025 14:34:23 -0800 Subject: [PATCH] fix: keep write queues enabled, flush before returning non-streaming responses Keep write queues enabled for all backends (simplicity + performance). WAL mode handles SQLite concurrency without locking. Flush queued writes before returning in non-streaming mode to ensure immediate visibility for callers who expect synchronous behavior. --- .../agents/meta_reference/responses/openai_responses.py | 3 +++ .../providers/utils/inference/inference_store.py | 7 ------- .../providers/utils/responses/responses_store.py | 7 ------- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py index 933cfe963..2416f857e 100644 --- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -316,6 +316,9 @@ class OpenAIResponsesImpl: if final_response is None: raise ValueError("The response stream never reached a terminal state") + + # Flush any queued writes to ensure immediate visibility + await self.responses_store.flush() return final_response async def _create_streaming_response( diff --git a/src/llama_stack/providers/utils/inference/inference_store.py b/src/llama_stack/providers/utils/inference/inference_store.py index 2bf947a8d..003adfddb 100644 --- a/src/llama_stack/providers/utils/inference/inference_store.py +++ b/src/llama_stack/providers/utils/inference/inference_store.py @@ -48,13 +48,6 @@ class InferenceStore: base_store = sqlstore_impl(self.reference) self.sql_store = AuthorizedSqlStore(base_store, self.policy) - # Disable write queue for SQLite since WAL mode handles concurrency - # Keep it enabled for other backends (like Postgres) for performance - backend_config = _SQLSTORE_BACKENDS.get(self.reference.backend) - if backend_config and backend_config.type == StorageBackendType.SQL_SQLITE: - self.enable_write_queue = False - logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)") - await self.sql_store.create_table( "chat_completions", { diff --git a/src/llama_stack/providers/utils/responses/responses_store.py b/src/llama_stack/providers/utils/responses/responses_store.py index 40466d00c..5cdca0488 100644 --- a/src/llama_stack/providers/utils/responses/responses_store.py +++ b/src/llama_stack/providers/utils/responses/responses_store.py @@ -70,13 +70,6 @@ class ResponsesStore: base_store = sqlstore_impl(self.reference) self.sql_store = AuthorizedSqlStore(base_store, self.policy) - # Disable write queue for SQLite since WAL mode handles concurrency - # Keep it enabled for other backends (like Postgres) for performance - backend_config = _SQLSTORE_BACKENDS.get(self.reference.backend) - if backend_config and backend_config.type == StorageBackendType.SQL_SQLITE: - self.enable_write_queue = False - logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)") - await self.sql_store.create_table( "openai_responses", {