diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py index 2416f857e..933cfe963 100644 --- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -316,9 +316,6 @@ class OpenAIResponsesImpl: if final_response is None: raise ValueError("The response stream never reached a terminal state") - - # Flush any queued writes to ensure immediate visibility - await self.responses_store.flush() return final_response async def _create_streaming_response( diff --git a/src/llama_stack/providers/utils/inference/inference_store.py b/src/llama_stack/providers/utils/inference/inference_store.py index 2a2de3b84..2bf947a8d 100644 --- a/src/llama_stack/providers/utils/inference/inference_store.py +++ b/src/llama_stack/providers/utils/inference/inference_store.py @@ -16,12 +16,12 @@ from llama_stack.apis.inference import ( Order, ) from llama_stack.core.datatypes import AccessRule -from llama_stack.core.storage.datatypes import InferenceStoreReference +from llama_stack.core.storage.datatypes import InferenceStoreReference, StorageBackendType from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import sqlstore_impl +from ..sqlstore.sqlstore import _SQLSTORE_BACKENDS, sqlstore_impl logger = get_logger(name=__name__, category="inference") @@ -48,6 +48,13 @@ class InferenceStore: base_store = sqlstore_impl(self.reference) self.sql_store = AuthorizedSqlStore(base_store, self.policy) + # Disable write queue for SQLite since WAL mode handles concurrency + # Keep it enabled for other backends (like Postgres) for performance + backend_config = _SQLSTORE_BACKENDS.get(self.reference.backend) + if backend_config and backend_config.type == StorageBackendType.SQL_SQLITE: + self.enable_write_queue = False + logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)") + await self.sql_store.create_table( "chat_completions", { diff --git a/src/llama_stack/providers/utils/responses/responses_store.py b/src/llama_stack/providers/utils/responses/responses_store.py index 4a4242d14..40466d00c 100644 --- a/src/llama_stack/providers/utils/responses/responses_store.py +++ b/src/llama_stack/providers/utils/responses/responses_store.py @@ -19,12 +19,12 @@ from llama_stack.apis.agents.openai_responses import ( ) from llama_stack.apis.inference import OpenAIMessageParam from llama_stack.core.datatypes import AccessRule -from llama_stack.core.storage.datatypes import ResponsesStoreReference, SqlStoreReference +from llama_stack.core.storage.datatypes import ResponsesStoreReference, SqlStoreReference, StorageBackendType from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import sqlstore_impl +from ..sqlstore.sqlstore import _SQLSTORE_BACKENDS, sqlstore_impl logger = get_logger(name=__name__, category="openai_responses") @@ -70,6 +70,13 @@ class ResponsesStore: base_store = sqlstore_impl(self.reference) self.sql_store = AuthorizedSqlStore(base_store, self.policy) + # Disable write queue for SQLite since WAL mode handles concurrency + # Keep it enabled for other backends (like Postgres) for performance + backend_config = _SQLSTORE_BACKENDS.get(self.reference.backend) + if backend_config and backend_config.type == StorageBackendType.SQL_SQLITE: + self.enable_write_queue = False + logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)") + await self.sql_store.create_table( "openai_responses", {