From 958600a5c18450606da0cd5af3c1bcd53cdb13d5 Mon Sep 17 00:00:00 2001
From: Wen Zhou
Date: Tue, 1 Jul 2025 20:08:55 +0200
Subject: [PATCH 1/2] fix: update zero_to_hero package and README (#2578)

# What does this PR do?
- update README.md format and Python version
- update package name: CustomTool was renamed to ClientTool in https://github.com/meta-llama/llama-stack-client-python/pull/73

Closes #2556

## Test Plan

Signed-off-by: Wen Zhou
---
 docs/zero_to_hero_guide/04_Tool_Calling101.ipynb |  6 +++---
 docs/zero_to_hero_guide/README.md                | 16 ++++++++--------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb b/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb
index de3754b21..00a427b07 100644
--- a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb
+++ b/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb
@@ -36,7 +36,7 @@
     "from dotenv import load_dotenv\n",
     "from llama_stack_client import LlamaStackClient\n",
     "from llama_stack_client.lib.agents.agent import Agent\n",
-    "from llama_stack_client.lib.agents.custom_tool import CustomTool\n",
+    "from llama_stack_client.lib.agents.client_tool import ClientTool\n",
     "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
     "from llama_stack_client.types import CompletionMessage\n",
     "from llama_stack_client.types.agent_create_params import AgentConfig\n",
@@ -129,7 +129,7 @@
    "source": [
     "## Step 3: Create a Custom Tool Class\n",
     "\n",
-    "Here, we defines the `WebSearchTool` class, which extends `CustomTool` to integrate the Brave Search API with Llama Stack, enabling web search capabilities within AI workflows. The class handles incoming user queries, interacts with the `BraveSearch` class for data retrieval, and formats results for effective response generation."
+    "Here, we define the `WebSearchTool` class, which extends `ClientTool` to integrate the Brave Search API with Llama Stack, enabling web search capabilities within AI workflows. The class handles incoming user queries, interacts with the `BraveSearch` class for data retrieval, and formats results for effective response generation."
    ]
   },
   {
@@ -139,7 +139,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "class WebSearchTool(CustomTool):\n",
+    "class WebSearchTool(ClientTool):\n",
     "    def __init__(self, api_key: str):\n",
     "        self.api_key = api_key\n",
     "        self.engine = BraveSearch(api_key)\n",
diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md
index 96f9768de..a891aa343 100644
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@@ -2,9 +2,9 @@
 Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Providers providing their implementations. These building blocks are assembled into Distributions which are easy for developers to get from zero to production.
 
-This guide will walk you through an end-to-end workflow with Llama Stack with Ollama as the inference provider and ChromaDB as the memory provider. Please note the steps for configuring your provider and distribution will vary a little depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack.
+This guide will walk you through an end-to-end workflow with Llama Stack with Ollama as the inference provider and ChromaDB as the VectorIO provider. Please note the steps for configuring your provider and distribution will vary depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack.
-If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in.
+If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from 'Tool Calling' to 'Agents' in detail. Feel free to skip to the end to explore the advanced topics you're interested in.
 
 > If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together_Llama_Stack_Server.ipynb). This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server.
 
@@ -26,15 +26,15 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
    - Follow instructions based on the OS you are on. For example, if you are on a Mac, download and unzip `Ollama-darwin.zip`.
    - Run the `Ollama` application.
 
-1. **Download the Ollama CLI**:
+2. **Download the Ollama CLI**:
    Ensure you have the `ollama` command line tool by downloading and installing it from the same website.
 
-1. **Start ollama server**:
+3. **Start ollama server**:
    Open the terminal and run:
-   ```
+   ```bash
    ollama serve
    ```
-1. **Run the model**:
+4. **Run the model**:
   Open the terminal and run:
   ```bash
   ollama run llama3.2:3b-instruct-fp16 --keepalive -1m
@@ -48,9 +48,9 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 ## Install Dependencies and Set Up Environment
 
 1. **Create a Conda Environment**:
-   Create a new Conda environment with Python 3.10:
+   Create a new Conda environment with Python 3.12:
   ```bash
-   conda create -n ollama python=3.10
+   conda create -n ollama python=3.12
   ```
   Activate the environment:
   ```bash

From fc735a414eb013f8e9342c6a1584728c9d02ea4a Mon Sep 17 00:00:00 2001
From: ehhuang
Date: Tue, 1 Jul 2025 14:48:46 -0700
Subject: [PATCH 2/2] test: Add one-step integration testing with server auto-start (#2580)

## Summary
Add support for the `server:<config>` format in the `--stack-config` option to enable seamless one-step integration testing. This eliminates the need to manually start servers in separate terminals before running tests.
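For orientation, here is a minimal sketch of how a `server:<config>[:<port>]` value is interpreted (illustrative only, not part of this patch; `parse_server_config` is a hypothetical helper, and the real logic lives in the `llama_stack_client` fixture in `tests/integration/fixtures/common.py`, shown in the diff below):

```python
DEFAULT_PORT = 8321  # assumed default Llama Stack server port


def parse_server_config(value: str) -> tuple[str, int]:
    """Split 'server:fireworks' or 'server:together:8322' into (config, port)."""
    parts = value.split(":")
    config_name = parts[1]
    port = int(parts[2]) if len(parts) > 2 else DEFAULT_PORT
    return config_name, port


assert parse_server_config("server:fireworks") == ("fireworks", 8321)
assert parse_server_config("server:together:8322") == ("together", 8322)
```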
## Key Features
- **Auto-start server**: Automatically launches `llama stack run <config>` if the target port is available
- **Smart reuse**: Reuses existing server if port is already occupied
- **Health check polling**: Waits up to 2 minutes for server readiness via `/v1/health` endpoint
- **Custom port support**: Use `server:<config>:<port>` for non-default ports
- **Clean output**: Server runs quietly in background without cluttering test output
- **Backward compatibility**: All existing `--stack-config` formats continue to work

## Usage Examples
```bash
# Auto-start server with default port 8321
pytest tests/integration/inference/ --stack-config=server:fireworks

# Use custom port
pytest tests/integration/safety/ --stack-config=server:together:8322

# Run multiple test suites seamlessly
pytest tests/integration/inference/ tests/integration/agents/ --stack-config=server:starter
```

## Implementation Details
- Enhanced `llama_stack_client` fixture with server management
- Updated documentation with cleaner organization and comprehensive examples
- Added utility functions for port checking, server startup, and health verification

## Test Plan
- Verified server auto-start when port 8321 is available
- Verified server reuse when port 8321 is occupied
- Tested health check polling via `/v1/health` endpoint
- Confirmed custom port configuration works correctly
- Verified backward compatibility with existing config formats

## Before/After Comparison

**Before (2 steps):**
```bash
# Terminal 1: Start server manually
llama stack run fireworks --port 8321

# Terminal 2: Wait for startup, then run tests
pytest tests/integration/inference/ --stack-config=http://localhost:8321
```

**After (1 step):**
```bash
# Single command handles everything
pytest tests/integration/inference/ --stack-config=server:fireworks
```
---
 tests/integration/README.md          | 39 ++++++++++++--
 tests/integration/fixtures/common.py | 76 ++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 5 deletions(-)

diff --git a/tests/integration/README.md b/tests/integration/README.md
index 31d58c83f..fc8612139 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -9,7 +9,9 @@ pytest --help
 ```
 
 Here are the most important options:
-- `--stack-config`: specify the stack config to use. You have three ways to point to a stack:
+- `--stack-config`: specify the stack config to use. You have four ways to point to a stack:
+  - **`server:<config>`** - automatically start a server with the given config (e.g., `server:fireworks`). This provides one-step testing by auto-starting the server if the port is available, or reusing an existing server if already running.
+  - **`server:<config>:<port>`** - same as above but with a custom port (e.g., `server:together:8322`)
   - a URL which points to a Llama Stack distribution server
   - a template (e.g., `fireworks`, `together`) or a path to a `run.yaml` file
   - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
@@ -26,12 +28,39 @@ Model parameters can be influenced by the following options:
 
 Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped if no model is specified.
-Experimental, under development, options:
-- `--record-responses`: record new API responses instead of using cached ones
-
-
 ## Examples
 
+### Testing against a Server
+
+Run all text inference tests by auto-starting a server with the `fireworks` config:
+
+```bash
+pytest -s -v tests/integration/inference/test_text_inference.py \
+   --stack-config=server:fireworks \
+   --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+Run tests with auto-server startup on a custom port:
+
+```bash
+pytest -s -v tests/integration/inference/ \
+   --stack-config=server:together:8322 \
+   --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+Run multiple test suites with auto-server (eliminates manual server management):
+
+```bash
+# Auto-start server and run all integration tests
+export FIREWORKS_API_KEY=<your_api_key>
+
+pytest -s -v tests/integration/inference/ tests/integration/safety/ tests/integration/agents/ \
+   --stack-config=server:fireworks \
+   --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+### Testing with Library Client
+
 Run all text inference tests with the `together` distribution:
 
 ```bash

diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py
index 8b6b3ddbe..2d6092e44 100644
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@@ -6,9 +6,13 @@
 import inspect
 import os
+import socket
+import subprocess
 import tempfile
+import time
 
 import pytest
+import requests
 import yaml
 from llama_stack_client import LlamaStackClient
 from openai import OpenAI
 
@@ -17,6 +21,44 @@ from llama_stack import LlamaStackAsLibraryClient
 from llama_stack.distribution.stack import run_config_from_adhoc_config_spec
 from llama_stack.env import get_env_or_fail
 
+DEFAULT_PORT = 8321
+
+
+def is_port_available(port: int, host: str = "localhost") -> bool:
+    """Check if a port is available for binding."""
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.bind((host, port))
+            return True
+    except OSError:
+        return False
+
+
+def start_llama_stack_server(config_name: str) -> subprocess.Popen:
+    """Start a llama stack server with the given config."""
+    cmd = ["llama", "stack", "run", config_name]
+
+    # Start server in background
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    return process
+
+
+def wait_for_server_ready(base_url: str, timeout: int = 120) -> bool:
+    """Wait for the server to be ready by polling the health endpoint."""
+    health_url = f"{base_url}/v1/health"
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        try:
+            response = requests.get(health_url, timeout=5)
+            if response.status_code == 200:
+                return True
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
+            pass
+        time.sleep(0.5)
+
+    return False
+
 
 @pytest.fixture(scope="session")
 def provider_data():
@@ -122,6 +164,40 @@ def llama_stack_client(request, provider_data):
     if not config:
         raise ValueError("You must specify either --stack-config or LLAMA_STACK_CONFIG")
 
+    # Handle server:<config> format or server:<config>:<port>
+    if config.startswith("server:"):
+        parts = config.split(":")
+        config_name = parts[1]
+        port = int(parts[2]) if len(parts) > 2 else int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
+        base_url = f"http://localhost:{port}"
+
+        # Check if port is available
+        if is_port_available(port):
+            print(f"Starting llama stack server with config '{config_name}' on port {port}...")
+
+            # Start server
+            server_process = start_llama_stack_server(config_name)
+
+            # Wait for server to be ready
+            if not wait_for_server_ready(base_url, timeout=120):
+                print("Server failed to start within timeout")
+                server_process.terminate()
+                raise RuntimeError(
+                    f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid."
+                )
+
+            print(f"Server is ready at {base_url}")
+
+            # Store process for potential cleanup (pytest will handle termination at session end)
+            request.session._llama_stack_server_process = server_process
+        else:
+            print(f"Port {port} is already in use, assuming server is already running...")
+
+        return LlamaStackClient(
+            base_url=base_url,
+            provider_data=provider_data,
+        )
+
     # check if this looks like a URL
     if config.startswith("http") or "//" in config:
         return LlamaStackClient(
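The fixture above stashes the `Popen` handle on `request.session` "for potential cleanup". As a point of reference, a session-teardown hook along the following lines could terminate the auto-started server when the run ends (a minimal sketch, not part of this patch; it assumes the `_llama_stack_server_process` attribute set by the fixture and would live in the test suite's `conftest.py`):

```python
import subprocess


def pytest_sessionfinish(session, exitstatus):
    """Terminate the auto-started Llama Stack server, if one was launched."""
    process = getattr(session, "_llama_stack_server_process", None)
    if process is None:
        return
    process.terminate()
    try:
        # Give the server a few seconds to shut down cleanly, then force-kill.
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
```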