From 241e189fee5fdb9a4f944691b39c87d16f6b49ac Mon Sep 17 00:00:00 2001
From: Roy Belio
Date: Tue, 4 Nov 2025 16:22:12 +0200
Subject: [PATCH] refactor: address PR feedback - improve naming, error
 handling, and documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address all feedback from PR #3962:

**Code Quality Improvements:**
- Rename `_uvicorn_run` → `_run_server` for accurate method naming
- Refactor error handling: move Gunicorn fallback logic from `_run_with_gunicorn` to caller
- Update comments to reflect both Uvicorn and Gunicorn behavior
- Update test mock from `_uvicorn_run` to `_run_server`

**Environment Variable:**
- Change `LLAMA_STACK_DISABLE_GUNICORN` → `LLAMA_STACK_ENABLE_GUNICORN`
- More intuitive positive logic (no double negatives)
- Defaults to `true` on Unix systems
- Clearer log messages distinguishing platform limitations vs explicit disable

**Documentation:**
- Remove unnecessary `uv sync --group unit --group test` from user docs
- Clarify SQLite limitations: "SQLite only allows one writer at a time"
- Accurate explanation: WAL mode enables concurrent reads but writes are serialized
- Strong recommendation for PostgreSQL in production with high traffic

**Architecture:**
- Better separation of concerns: `_run_with_gunicorn` just executes, caller handles fallback
- Exceptions propagate to caller for centralized decision making

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
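Example of the new toggle (invocation shown for illustration only; the
config path is a placeholder):

```bash
# Default on Linux/macOS: Gunicorn with Uvicorn workers, no flag needed
llama stack run ./run.yaml

# Opt out explicitly (testing/debugging): single-process Uvicorn
LLAMA_STACK_ENABLE_GUNICORN=false llama stack run ./run.yaml
```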
---
 .../self_hosted_distro/starter.md             |  3 +-
 .../starting_llama_stack_server.mdx           |  4 +-
 docs/docs/providers/agents/index.mdx          |  4 +-
 docs/docs/providers/batches/index.mdx         | 24 +++----
 docs/docs/providers/eval/index.mdx            |  4 +-
 docs/docs/providers/files/index.mdx           |  4 +-
 docs/docs/providers/inference/index.mdx       | 20 +++---
 docs/docs/providers/safety/index.mdx          |  4 +-
 src/llama_stack/cli/stack/run.py              | 67 +++++++++++--------
 tests/unit/cli/test_stack_config.py           |  4 +-
 10 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/docs/docs/distributions/self_hosted_distro/starter.md b/docs/docs/distributions/self_hosted_distro/starter.md
index acab6aa32..890e3ea74 100644
--- a/docs/docs/distributions/self_hosted_distro/starter.md
+++ b/docs/docs/distributions/self_hosted_distro/starter.md
@@ -90,7 +90,8 @@ On Unix-based systems (Linux, macOS), the server automatically uses Gunicorn wit
 
 **Important Notes**:
 - On Windows, the server automatically falls back to single-process Uvicorn.
-- **Database Race Condition**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions (e.g., "table already exists" errors) as multiple workers simultaneously attempt to create database tables. To avoid this issue in production, set `GUNICORN_PRELOAD=true` and ensure all dependencies are installed with `uv sync --group unit --group test`.
+- **Database Race Condition**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions (e.g., "table already exists" errors) as multiple workers simultaneously attempt to create database tables. To avoid this issue in production, set `GUNICORN_PRELOAD=true`.
+- **SQLite with Multiple Workers**: SQLite works with Gunicorn's multi-process mode for development and low-to-moderate traffic scenarios. The system automatically enables WAL (Write-Ahead Logging) mode and sets a 5-second busy timeout. However, **SQLite only allows one writer at a time** - even with WAL mode, write operations from multiple workers are serialized, causing workers to wait for database locks under concurrent write load. **For production deployments with high traffic or multiple workers, we strongly recommend using PostgreSQL or another production-grade database** for true concurrent write performance.
 
 **Example production configuration:**
 ```bash
diff --git a/docs/docs/distributions/starting_llama_stack_server.mdx b/docs/docs/distributions/starting_llama_stack_server.mdx
index db34d8e66..5e5d0814c 100644
--- a/docs/docs/distributions/starting_llama_stack_server.mdx
+++ b/docs/docs/distributions/starting_llama_stack_server.mdx
@@ -44,7 +44,9 @@ Configure Gunicorn behavior using environment variables:
 - `GUNICORN_MAX_REQUESTS_JITTER`: Randomize worker restart timing (default: `1000`)
 - `GUNICORN_PRELOAD`: Preload app before forking workers for memory efficiency (default: `true`, as set in `run.py` line 264)
 
-**Important**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions. To avoid this, set `GUNICORN_PRELOAD=true` and install all dependencies with `uv sync --group unit --group test`.
+**Important Notes**:
+- When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions. To avoid this, set `GUNICORN_PRELOAD=true`.
+- **SQLite with Multiple Workers**: SQLite works with Gunicorn's multi-process mode for development and low-to-moderate traffic scenarios. The system automatically enables WAL (Write-Ahead Logging) mode and sets a 5-second busy timeout. However, **SQLite only allows one writer at a time** - even with WAL mode, write operations from multiple workers are serialized, causing workers to wait for database locks under concurrent write load. **For production deployments with high traffic or multiple workers, we strongly recommend using PostgreSQL or another production-grade database** for true concurrent write performance.
 
 **Example production configuration:**
 ```bash
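For context on the SQLite note above: the WAL-plus-busy-timeout behavior it describes boils down to two PRAGMAs. A minimal standalone sketch (the database path is a placeholder; this is not the Stack's actual initialization code):

```python
import sqlite3

# Each worker process opens its own connection to the shared database file.
conn = sqlite3.connect("llama_stack.db")

# WAL lets readers proceed concurrently with a single writer...
conn.execute("PRAGMA journal_mode=WAL")

# ...and a 5s busy timeout makes a blocked writer wait for the lock
# instead of failing immediately with "database is locked".
conn.execute("PRAGMA busy_timeout=5000")

# Writes from multiple workers are still serialized: only one process can
# hold the write lock at a time, which is the bottleneck the docs warn about.
conn.execute("CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)")
conn.execute("INSERT OR REPLACE INTO kv VALUES ('key', 'value')")
conn.commit()
```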
diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx
index 06eb104af..52b92734e 100644
--- a/docs/docs/providers/agents/index.mdx
+++ b/docs/docs/providers/agents/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Agents
 
-    APIs for creating and interacting with agentic systems."
+APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents
 
 Agents
 
-    APIs for creating and interacting with agentic systems.
+APIs for creating and interacting with agentic systems.
 
 This section contains documentation for all available providers for the **agents** API.
diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx
index 2c64b277f..18e5e314d 100644
--- a/docs/docs/providers/batches/index.mdx
+++ b/docs/docs/providers/batches/index.mdx
@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.
 
-    The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.
 
-    This API provides the following extensions:
-     - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation
 
-    Note: This API is currently under active development and may undergo changes."
+Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview
 
 The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.
 
-    The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.
 
-    This API provides the following extensions:
-     - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation
 
-    Note: This API is currently under active development and may undergo changes.
+Note: This API is currently under active development and may undergo changes.
 
 This section contains documentation for all available providers for the **batches** API.
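Since the batches description advertises use of the openai client libraries, here is a minimal hypothetical sketch of that integration (base URL, API key, and file id are placeholders; the idempotent-creation extension is omitted because its parameter is not part of this patch):

```python
from openai import OpenAI

# Point the standard OpenAI client at a running Llama Stack server
# (URL is a placeholder for wherever your stack is listening).
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Create a batch from a previously uploaded JSONL file of requests.
batch = client.batches.create(
    input_file_id="file-abc123",  # placeholder file id
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)
```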
diff --git a/docs/docs/providers/eval/index.mdx b/docs/docs/providers/eval/index.mdx
index 94bafe15e..45fc5ebd3 100644
--- a/docs/docs/providers/eval/index.mdx
+++ b/docs/docs/providers/eval/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Evaluations
 
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."
+Llama Stack Evaluation API for running evaluations on model and agent candidates."
 sidebar_label: Eval
 title: Eval
 ---
@@ -12,6 +12,6 @@ title: Eval
 
 Evaluations
 
-    Llama Stack Evaluation API for running evaluations on model and agent candidates.
+Llama Stack Evaluation API for running evaluations on model and agent candidates.
 
 This section contains documentation for all available providers for the **eval** API.
diff --git a/docs/docs/providers/files/index.mdx b/docs/docs/providers/files/index.mdx
index 19e338035..c61c4f1b6 100644
--- a/docs/docs/providers/files/index.mdx
+++ b/docs/docs/providers/files/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Files
 
-    This API is used to upload documents that can be used with other Llama Stack APIs."
+This API is used to upload documents that can be used with other Llama Stack APIs."
 sidebar_label: Files
 title: Files
 ---
@@ -12,6 +12,6 @@ title: Files
 
 Files
 
-    This API is used to upload documents that can be used with other Llama Stack APIs.
+This API is used to upload documents that can be used with other Llama Stack APIs.
 
 This section contains documentation for all available providers for the **files** API.
diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index 478611420..871acbb00 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -1,12 +1,12 @@
 ---
 description: "Inference
 
-    Llama Stack Inference API for generating completions, chat completions, and embeddings.
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-    This API provides the raw interface to the underlying models. Three kinds of models are supported:
-    - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models: these models reorder the documents based on their relevance to a query."
+This API provides the raw interface to the underlying models. Three kinds of models are supported:
+- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+- Rerank models: these models reorder the documents based on their relevance to a query."
 sidebar_label: Inference
 title: Inference
 ---
@@ -17,11 +17,11 @@ title: Inference
 
 Inference
 
-    Llama Stack Inference API for generating completions, chat completions, and embeddings.
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-    This API provides the raw interface to the underlying models. Three kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models: these models reorder the documents based on their relevance to a query.
+This API provides the raw interface to the underlying models. Three kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+- Rerank models: these models reorder the documents based on their relevance to a query.
 
 This section contains documentation for all available providers for the **inference** API.
diff --git a/docs/docs/providers/safety/index.mdx b/docs/docs/providers/safety/index.mdx
index 4e2de4f33..038565475 100644
--- a/docs/docs/providers/safety/index.mdx
+++ b/docs/docs/providers/safety/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Safety
 
-    OpenAI-compatible Moderations API."
+OpenAI-compatible Moderations API."
 sidebar_label: Safety
 title: Safety
 ---
@@ -12,6 +12,6 @@ title: Safety
 
 Safety
 
-    OpenAI-compatible Moderations API.
+OpenAI-compatible Moderations API.
 
 This section contains documentation for all available providers for the **safety** API.
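The safety description references an OpenAI-compatible Moderations API; a minimal hypothetical call via the standard client (base URL and model id are placeholders, not values defined by this patch):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Run a moderation check through the OpenAI-compatible endpoint.
result = client.moderations.create(
    model="llama-guard",  # placeholder: whichever safety model your stack serves
    input="Is this content safe to show to users?",
)
print(result.results[0].flagged)
```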
diff --git a/src/llama_stack/cli/stack/run.py b/src/llama_stack/cli/stack/run.py
index 792d6f0f6..4778abc06 100644
--- a/src/llama_stack/cli/stack/run.py
+++ b/src/llama_stack/cli/stack/run.py
@@ -181,9 +181,15 @@ class StackRun(Subcommand):
         except AttributeError as e:
             self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
 
-        self._uvicorn_run(config_file, args)
+        self._run_server(config_file, args)
 
-    def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
+    def _run_server(self, config_file: Path | None, args: argparse.Namespace) -> None:
+        """
+        Run the Llama Stack server using either Gunicorn (on Unix systems) or Uvicorn (on Windows or when disabled).
+
+        On Unix systems (Linux/macOS), defaults to Gunicorn with Uvicorn workers for production-grade multi-process
+        performance. Falls back to single-process Uvicorn on Windows or when LLAMA_STACK_ENABLE_GUNICORN=false.
+        """
         if not config_file:
             self.parser.error("Config file is required")
 
@@ -229,27 +235,37 @@ class StackRun(Subcommand):
 
         logger.info(f"Listening on {host}:{port}")
 
-        # We need to catch KeyboardInterrupt because uvicorn's signal handling
-        # re-raises SIGINT signals using signal.raise_signal(), which Python
-        # converts to KeyboardInterrupt. Without this catch, we'd get a confusing
-        # stack trace when using Ctrl+C or kill -2 (SIGINT).
-        # SIGTERM (kill -15) works fine without this because Python doesn't
-        # have a default handler for it.
-        #
-        # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
-        # signal handling but this is quite intrusive and not worth the effort.
+        # We need to catch KeyboardInterrupt because both Uvicorn and Gunicorn's signal handling
+        # can raise SIGINT signals, which Python converts to KeyboardInterrupt. Without this catch,
+        # we'd get a confusing stack trace when using Ctrl+C or kill -2 (SIGINT).
+        # SIGTERM (kill -15) works fine without this because Python doesn't have a default handler for it.
         try:
-            # Check if Gunicorn should be disabled (for testing or debugging)
-            disable_gunicorn = os.getenv("LLAMA_STACK_DISABLE_GUNICORN", "false").lower() == "true"
+            # Check if Gunicorn should be enabled
+            # Default to true on Unix systems, can be disabled via environment variable
+            enable_gunicorn = os.getenv("LLAMA_STACK_ENABLE_GUNICORN", "true").lower() == "true" and sys.platform in (
+                "linux",
+                "darwin",
+            )
 
-            if not disable_gunicorn and sys.platform in ("linux", "darwin"):
+            if enable_gunicorn:
                 # On Unix-like systems, use Gunicorn with Uvicorn workers for production-grade performance
-                self._run_with_gunicorn(host, port, uvicorn_config)
+                try:
+                    self._run_with_gunicorn(host, port, uvicorn_config)
+                except (FileNotFoundError, subprocess.CalledProcessError) as e:
+                    # Gunicorn not available or failed to start - fall back to Uvicorn
+                    logger.warning(f"Gunicorn unavailable or failed to start: {e}")
+                    logger.info("Falling back to single-process Uvicorn server...")
+                    uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
             else:
-                # On other systems (e.g., Windows), fall back to Uvicorn directly
-                # Also used when LLAMA_STACK_DISABLE_GUNICORN=true (for tests)
-                if disable_gunicorn:
-                    logger.info("Gunicorn disabled via LLAMA_STACK_DISABLE_GUNICORN environment variable")
+                # Fall back to Uvicorn for:
+                # - Windows systems (Gunicorn not supported)
+                # - Unix systems with LLAMA_STACK_ENABLE_GUNICORN=false (for testing/debugging)
+                if sys.platform not in ("linux", "darwin"):
+                    logger.info("Using single-process Uvicorn server (Gunicorn not supported on this platform)")
+                else:
+                    logger.info(
+                        "Using single-process Uvicorn server (Gunicorn disabled via LLAMA_STACK_ENABLE_GUNICORN=false)"
+                    )
                 uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
         except (KeyboardInterrupt, SystemExit):
             logger.info("Received interrupt signal, shutting down gracefully...")
@@ -359,16 +375,9 @@ class StackRun(Subcommand):
         logger.info(f"Worker recycling: every {max_requests}±{max_requests_jitter} requests (prevents memory leaks)")
         logger.info(f"Total concurrent capacity: {num_workers * worker_connections} connections")
 
-        try:
-            # Execute the Gunicorn command
-            subprocess.run(gunicorn_command, check=True)
-        except FileNotFoundError:
-            logger.error("Error: 'gunicorn' command not found. Please ensure Gunicorn is installed.")
-            logger.error("Falling back to Uvicorn...")
-            uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
-        except subprocess.CalledProcessError as e:
Error: {e}") - sys.exit(1) + # Execute the Gunicorn command + # If Gunicorn is not found or fails to start, raise the exception for the caller to handle + subprocess.run(gunicorn_command, check=True) def _start_ui_development_server(self, stack_server_port: int): logger.info("Attempting to start UI development server...") diff --git a/tests/unit/cli/test_stack_config.py b/tests/unit/cli/test_stack_config.py index 6aefac003..0e53cf3f8 100644 --- a/tests/unit/cli/test_stack_config.py +++ b/tests/unit/cli/test_stack_config.py @@ -295,8 +295,8 @@ def test_providers_flag_generates_config_with_api_keys(): enable_ui=False, ) - # Mock _uvicorn_run to prevent starting a server - with patch.object(stack_run, "_uvicorn_run"): + # Mock _run_server to prevent starting a server + with patch.object(stack_run, "_run_server"): stack_run._run_stack_run_cmd(args) # Read the generated config file