From 241e189fee5fdb9a4f944691b39c87d16f6b49ac Mon Sep 17 00:00:00 2001
From: Roy Belio
Date: Tue, 4 Nov 2025 16:22:12 +0200
Subject: [PATCH] refactor: address PR feedback - improve naming, error
 handling, and documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address all feedback from PR #3962:

**Code Quality Improvements:**
- Rename `_uvicorn_run` → `_run_server` for accurate method naming
- Refactor error handling: move Gunicorn fallback logic from `_run_with_gunicorn` to caller
- Update comments to reflect both Uvicorn and Gunicorn behavior
- Update test mock from `_uvicorn_run` to `_run_server`

**Environment Variable:**
- Change `LLAMA_STACK_DISABLE_GUNICORN` → `LLAMA_STACK_ENABLE_GUNICORN`
- More intuitive positive logic (no double negatives)
- Defaults to `true` on Unix systems
- Clearer log messages distinguishing platform limitations vs explicit disable

**Documentation:**
- Remove unnecessary `uv sync --group unit --group test` from user docs
- Clarify SQLite limitations: "SQLite only allows one writer at a time"
- Accurate explanation: WAL mode enables concurrent reads but writes are serialized
- Strong recommendation for PostgreSQL in production with high traffic

**Architecture:**
- Better separation of concerns: `_run_with_gunicorn` just executes, caller handles fallback
- Exceptions propagate to caller for centralized decision making

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
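Example of the new toggle (invocation shown for illustration only; the
config path is a placeholder):

```bash
# Default on Linux/macOS: Gunicorn with Uvicorn workers, no flag needed
llama stack run ./run.yaml

# Opt out explicitly (testing/debugging): single-process Uvicorn
LLAMA_STACK_ENABLE_GUNICORN=false llama stack run ./run.yaml
```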
---
 .../self_hosted_distro/starter.md             |  3 +-
 .../starting_llama_stack_server.mdx           |  4 +-
 docs/docs/providers/agents/index.mdx          |  4 +-
 docs/docs/providers/batches/index.mdx         | 24 +++----
 docs/docs/providers/eval/index.mdx            |  4 +-
 docs/docs/providers/files/index.mdx           |  4 +-
 docs/docs/providers/inference/index.mdx       | 20 +++---
 docs/docs/providers/safety/index.mdx          |  4 +-
 src/llama_stack/cli/stack/run.py              | 67 +++++++++++--------
 tests/unit/cli/test_stack_config.py           |  4 +-
 10 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/docs/docs/distributions/self_hosted_distro/starter.md b/docs/docs/distributions/self_hosted_distro/starter.md
index acab6aa32..890e3ea74 100644
--- a/docs/docs/distributions/self_hosted_distro/starter.md
+++ b/docs/docs/distributions/self_hosted_distro/starter.md
@@ -90,7 +90,8 @@ On Unix-based systems (Linux, macOS), the server automatically uses Gunicorn wit
 
 **Important Notes**:
 - On Windows, the server automatically falls back to single-process Uvicorn.
-- **Database Race Condition**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions (e.g., "table already exists" errors) as multiple workers simultaneously attempt to create database tables. To avoid this issue in production, set `GUNICORN_PRELOAD=true` and ensure all dependencies are installed with `uv sync --group unit --group test`.
+- **Database Race Condition**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions (e.g., "table already exists" errors) as multiple workers simultaneously attempt to create database tables. To avoid this issue in production, set `GUNICORN_PRELOAD=true`.
+- **SQLite with Multiple Workers**: SQLite works with Gunicorn's multi-process mode for development and low-to-moderate traffic scenarios. The system automatically enables WAL (Write-Ahead Logging) mode and sets a 5-second busy timeout. However, **SQLite only allows one writer at a time** - even with WAL mode, write operations from multiple workers are serialized, causing workers to wait for database locks under concurrent write load. **For production deployments with high traffic or multiple workers, we strongly recommend using PostgreSQL or another production-grade database** for true concurrent write performance.
 
 **Example production configuration:**
 ```bash
diff --git a/docs/docs/distributions/starting_llama_stack_server.mdx b/docs/docs/distributions/starting_llama_stack_server.mdx
index db34d8e66..5e5d0814c 100644
--- a/docs/docs/distributions/starting_llama_stack_server.mdx
+++ b/docs/docs/distributions/starting_llama_stack_server.mdx
@@ -44,7 +44,9 @@ Configure Gunicorn behavior using environment variables:
 - `GUNICORN_MAX_REQUESTS_JITTER`: Randomize worker restart timing (default: `1000`)
 - `GUNICORN_PRELOAD`: Preload app before forking workers for memory efficiency (default: `true`, as set in `run.py` line 264)
 
-**Important**: When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions. To avoid this, set `GUNICORN_PRELOAD=true` and install all dependencies with `uv sync --group unit --group test`.
+**Important Notes**:
+- When using multiple workers without `GUNICORN_PRELOAD=true`, you may encounter database initialization race conditions. To avoid this, set `GUNICORN_PRELOAD=true`.
+- **SQLite with Multiple Workers**: SQLite works with Gunicorn's multi-process mode for development and low-to-moderate traffic scenarios. The system automatically enables WAL (Write-Ahead Logging) mode and sets a 5-second busy timeout. However, **SQLite only allows one writer at a time** - even with WAL mode, write operations from multiple workers are serialized, causing workers to wait for database locks under concurrent write load. **For production deployments with high traffic or multiple workers, we strongly recommend using PostgreSQL or another production-grade database** for true concurrent write performance.
 
 **Example production configuration:**
 ```bash
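For context on the SQLite note above: the WAL-plus-busy-timeout behavior it describes boils down to two PRAGMAs. A minimal standalone sketch (the database path is a placeholder; this is not the Stack's actual initialization code):

```python
import sqlite3

# Each worker process opens its own connection to the shared database file.
conn = sqlite3.connect("llama_stack.db")

# WAL lets readers proceed concurrently with a single writer...
conn.execute("PRAGMA journal_mode=WAL")

# ...and a 5s busy timeout makes a blocked writer wait for the lock
# instead of failing immediately with "database is locked".
conn.execute("PRAGMA busy_timeout=5000")

# Writes from multiple workers are still serialized: only one process can
# hold the write lock at a time, which is the bottleneck the docs warn about.
conn.execute("CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)")
conn.execute("INSERT OR REPLACE INTO kv VALUES ('key', 'value')")
conn.commit()
```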
diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx
index 06eb104af..52b92734e 100644
--- a/docs/docs/providers/agents/index.mdx
+++ b/docs/docs/providers/agents/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Agents
 
-    APIs for creating and interacting with agentic systems."
+APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents
 
 Agents
 
-    APIs for creating and interacting with agentic systems.
+APIs for creating and interacting with agentic systems.
 
 This section contains documentation for all available providers for the **agents** API.
diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx
index 2c64b277f..18e5e314d 100644
--- a/docs/docs/providers/batches/index.mdx
+++ b/docs/docs/providers/batches/index.mdx
@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.
 
-    The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.
 
-    This API provides the following extensions:
-     - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation
 
-    Note: This API is currently under active development and may undergo changes."
+Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview
 
 The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.
 
-    The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.
 
-    This API provides the following extensions:
-     - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation
 
-    Note: This API is currently under active development and may undergo changes.
+Note: This API is currently under active development and may undergo changes.
 
 This section contains documentation for all available providers for the **batches** API.
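Since the batches description advertises use of the openai client libraries, here is a minimal hypothetical sketch of that integration (base URL, API key, and file id are placeholders; the idempotent-creation extension is omitted because its parameter is not part of this patch):

```python
from openai import OpenAI

# Point the standard OpenAI client at a running Llama Stack server
# (URL is a placeholder for wherever your stack is listening).
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Create a batch from a previously uploaded JSONL file of requests.
batch = client.batches.create(
    input_file_id="file-abc123",  # placeholder file id
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)
```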
diff --git a/docs/docs/providers/eval/index.mdx b/docs/docs/providers/eval/index.mdx
index 94bafe15e..45fc5ebd3 100644
--- a/docs/docs/providers/eval/index.mdx
+++ b/docs/docs/providers/eval/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Evaluations
 
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."
+Llama Stack Evaluation API for running evaluations on model and agent candidates."
 sidebar_label: Eval
 title: Eval
 ---
@@ -12,6 +12,6 @@ title: Eval
 
 Evaluations
 
-    Llama Stack Evaluation API for running evaluations on model and agent candidates.
+Llama Stack Evaluation API for running evaluations on model and agent candidates.
 
 This section contains documentation for all available providers for the **eval** API.
diff --git a/docs/docs/providers/files/index.mdx b/docs/docs/providers/files/index.mdx
index 19e338035..c61c4f1b6 100644
--- a/docs/docs/providers/files/index.mdx
+++ b/docs/docs/providers/files/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Files
 
-    This API is used to upload documents that can be used with other Llama Stack APIs."
+This API is used to upload documents that can be used with other Llama Stack APIs."
 sidebar_label: Files
 title: Files
 ---
@@ -12,6 +12,6 @@ title: Files
 
 Files
 
-    This API is used to upload documents that can be used with other Llama Stack APIs.
+This API is used to upload documents that can be used with other Llama Stack APIs.
 
 This section contains documentation for all available providers for the **files** API.
diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index 478611420..871acbb00 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -1,12 +1,12 @@
 ---
 description: "Inference
 
-    Llama Stack Inference API for generating completions, chat completions, and embeddings.
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-    This API provides the raw interface to the underlying models. Three kinds of models are supported:
-    - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models: these models reorder the documents based on their relevance to a query."
+This API provides the raw interface to the underlying models. Three kinds of models are supported:
+- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+- Rerank models: these models reorder the documents based on their relevance to a query."
 sidebar_label: Inference
 title: Inference
 ---
@@ -17,11 +17,11 @@ title: Inference
 
 Inference
 
-    Llama Stack Inference API for generating completions, chat completions, and embeddings.
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-    This API provides the raw interface to the underlying models. Three kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models: these models reorder the documents based on their relevance to a query.
+This API provides the raw interface to the underlying models. Three kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+- Rerank models: these models reorder the documents based on their relevance to a query.
 
 This section contains documentation for all available providers for the **inference** API.
diff --git a/docs/docs/providers/safety/index.mdx b/docs/docs/providers/safety/index.mdx
index 4e2de4f33..038565475 100644
--- a/docs/docs/providers/safety/index.mdx
+++ b/docs/docs/providers/safety/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Safety
 
-    OpenAI-compatible Moderations API."
+OpenAI-compatible Moderations API."
 sidebar_label: Safety
 title: Safety
 ---
@@ -12,6 +12,6 @@ title: Safety
 
 Safety
 
-    OpenAI-compatible Moderations API.
+OpenAI-compatible Moderations API.
 
 This section contains documentation for all available providers for the **safety** API.
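The safety description references an OpenAI-compatible Moderations API; a minimal hypothetical call via the standard client (base URL and model id are placeholders, not values defined by this patch):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Run a moderation check through the OpenAI-compatible endpoint.
result = client.moderations.create(
    model="llama-guard",  # placeholder: whichever safety model your stack serves
    input="Is this content safe to show to users?",
)
print(result.results[0].flagged)
```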
diff --git a/src/llama_stack/cli/stack/run.py b/src/llama_stack/cli/stack/run.py
index 792d6f0f6..4778abc06 100644
--- a/src/llama_stack/cli/stack/run.py
+++ b/src/llama_stack/cli/stack/run.py
@@ -181,9 +181,15 @@ class StackRun(Subcommand):
         except AttributeError as e:
             self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
 
-        self._uvicorn_run(config_file, args)
+        self._run_server(config_file, args)
 
-    def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
+    def _run_server(self, config_file: Path | None, args: argparse.Namespace) -> None:
+        """
+        Run the Llama Stack server using either Gunicorn (on Unix systems) or Uvicorn (on Windows or when disabled).
+
+        On Unix systems (Linux/macOS), defaults to Gunicorn with Uvicorn workers for production-grade multi-process
+        performance. Falls back to single-process Uvicorn on Windows or when LLAMA_STACK_ENABLE_GUNICORN=false.
+        """
         if not config_file:
             self.parser.error("Config file is required")
 
@@ -229,27 +235,37 @@ class StackRun(Subcommand):
 
         logger.info(f"Listening on {host}:{port}")
 
-        # We need to catch KeyboardInterrupt because uvicorn's signal handling
-        # re-raises SIGINT signals using signal.raise_signal(), which Python
-        # converts to KeyboardInterrupt. Without this catch, we'd get a confusing
-        # stack trace when using Ctrl+C or kill -2 (SIGINT).
-        # SIGTERM (kill -15) works fine without this because Python doesn't
-        # have a default handler for it.
-        #
-        # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
-        # signal handling but this is quite intrusive and not worth the effort.
+        # We need to catch KeyboardInterrupt because both Uvicorn and Gunicorn's signal handling
+        # can raise SIGINT signals, which Python converts to KeyboardInterrupt. Without this catch,
+        # we'd get a confusing stack trace when using Ctrl+C or kill -2 (SIGINT).
+        # SIGTERM (kill -15) works fine without this because Python doesn't have a default handler for it.
         try:
-            # Check if Gunicorn should be disabled (for testing or debugging)
-            disable_gunicorn = os.getenv("LLAMA_STACK_DISABLE_GUNICORN", "false").lower() == "true"
+            # Check if Gunicorn should be enabled
+            # Default to true on Unix systems, can be disabled via environment variable
+            enable_gunicorn = os.getenv("LLAMA_STACK_ENABLE_GUNICORN", "true").lower() == "true" and sys.platform in (
+                "linux",
+                "darwin",
+            )
 
-            if not disable_gunicorn and sys.platform in ("linux", "darwin"):
+            if enable_gunicorn:
                 # On Unix-like systems, use Gunicorn with Uvicorn workers for production-grade performance
-                self._run_with_gunicorn(host, port, uvicorn_config)
+                try:
+                    self._run_with_gunicorn(host, port, uvicorn_config)
+                except (FileNotFoundError, subprocess.CalledProcessError) as e:
+                    # Gunicorn not available or failed to start - fall back to Uvicorn
+                    logger.warning(f"Gunicorn unavailable or failed to start: {e}")
+                    logger.info("Falling back to single-process Uvicorn server...")
+                    uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
             else:
-                # On other systems (e.g., Windows), fall back to Uvicorn directly
-                # Also used when LLAMA_STACK_DISABLE_GUNICORN=true (for tests)
-                if disable_gunicorn:
-                    logger.info("Gunicorn disabled via LLAMA_STACK_DISABLE_GUNICORN environment variable")
+                # Fall back to Uvicorn for:
+                # - Windows systems (Gunicorn not supported)
+                # - Unix systems with LLAMA_STACK_ENABLE_GUNICORN=false (for testing/debugging)
+                if sys.platform not in ("linux", "darwin"):
+                    logger.info("Using single-process Uvicorn server (Gunicorn not supported on this platform)")
+                else:
+                    logger.info(
+                        "Using single-process Uvicorn server (Gunicorn disabled via LLAMA_STACK_ENABLE_GUNICORN=false)"
+                    )
                 uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
         except (KeyboardInterrupt, SystemExit):
             logger.info("Received interrupt signal, shutting down gracefully...")
@@ -359,16 +375,9 @@ class StackRun(Subcommand):
         logger.info(f"Worker recycling: every {max_requests}±{max_requests_jitter} requests (prevents memory leaks)")
         logger.info(f"Total concurrent capacity: {num_workers * worker_connections} connections")
 
-        try:
-            # Execute the Gunicorn command
-            subprocess.run(gunicorn_command, check=True)
-        except FileNotFoundError:
-            logger.error("Error: 'gunicorn' command not found. Please ensure Gunicorn is installed.")
-            logger.error("Falling back to Uvicorn...")
-            uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
-        except subprocess.CalledProcessError as e:
Error: {e}") - sys.exit(1) + # Execute the Gunicorn command + # If Gunicorn is not found or fails to start, raise the exception for the caller to handle + subprocess.run(gunicorn_command, check=True) def _start_ui_development_server(self, stack_server_port: int): logger.info("Attempting to start UI development server...") diff --git a/tests/unit/cli/test_stack_config.py b/tests/unit/cli/test_stack_config.py index 6aefac003..0e53cf3f8 100644 --- a/tests/unit/cli/test_stack_config.py +++ b/tests/unit/cli/test_stack_config.py @@ -295,8 +295,8 @@ def test_providers_flag_generates_config_with_api_keys(): enable_ui=False, ) - # Mock _uvicorn_run to prevent starting a server - with patch.object(stack_run, "_uvicorn_run"): + # Mock _run_server to prevent starting a server + with patch.object(stack_run, "_run_server"): stack_run._run_stack_run_cmd(args) # Read the generated config file