Merge branch 'main' into fix-ragtool-insert-on-fips-cluster

Jorge 2025-07-02 09:02:14 +02:00 committed by GitHub
commit f62726fd76
4 changed files with 121 additions and 16 deletions

View file

@@ -36,7 +36,7 @@
"from dotenv import load_dotenv\n",
"from llama_stack_client import LlamaStackClient\n",
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.custom_tool import CustomTool\n",
"from llama_stack_client.lib.agents.client_tool import ClientTool\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types import CompletionMessage\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
@@ -129,7 +129,7 @@
"source": [
"## Step 3: Create a Custom Tool Class\n",
"\n",
"Here, we defines the `WebSearchTool` class, which extends `CustomTool` to integrate the Brave Search API with Llama Stack, enabling web search capabilities within AI workflows. The class handles incoming user queries, interacts with the `BraveSearch` class for data retrieval, and formats results for effective response generation."
"Here, we defines the `WebSearchTool` class, which extends `ClientTool` to integrate the Brave Search API with Llama Stack, enabling web search capabilities within AI workflows. The class handles incoming user queries, interacts with the `BraveSearch` class for data retrieval, and formats results for effective response generation."
]
},
{
@@ -139,7 +139,7 @@
"metadata": {},
"outputs": [],
"source": [
"class WebSearchTool(CustomTool):\n",
"class WebSearchTool(ClientTool):\n",
" def __init__(self, api_key: str):\n",
" self.api_key = api_key\n",
" self.engine = BraveSearch(api_key)\n",

View file

@@ -2,9 +2,9 @@
Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Providers providing their implementations. These building blocks are assembled into Distributions which are easy for developers to get from zero to production.
This guide will walk you through an end-to-end workflow with Llama Stack, using Ollama as the inference provider and ChromaDB as the memory provider. Please note the steps for configuring your provider and distribution will vary a little depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack.
This guide will walk you through an end-to-end workflow with Llama Stack, using Ollama as the inference provider and ChromaDB as the VectorIO provider. Please note the steps for configuring your provider and distribution will vary depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack.
If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in.
If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from 'Tool Calling' to 'Agents' in detail. Feel free to skip to the end to explore the advanced topics you're interested in.
> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together_Llama_Stack_Server.ipynb). This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server.
@@ -26,15 +26,15 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
- Follow instructions based on the OS you are on. For example, if you are on a Mac, download and unzip `Ollama-darwin.zip`.
- Run the `Ollama` application.
1. **Download the Ollama CLI**:
2. **Download the Ollama CLI**:
Ensure you have the `ollama` command line tool by downloading and installing it from the same website.
1. **Start ollama server**:
3. **Start ollama server**:
Open the terminal and run:
```
```bash
ollama serve
```
1. **Run the model**:
4. **Run the model**:
Open the terminal and run:
```bash
ollama run llama3.2:3b-instruct-fp16 --keepalive -1m
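Before moving on, it can help to confirm that the Ollama server is actually serving the model. This check is not part of the original guide and assumes Ollama's default port of 11434:

```bash
# List the models known to the local Ollama server (default port 11434)
curl http://localhost:11434/api/tags
```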
@@ -48,9 +48,9 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
## Install Dependencies and Set Up Environment
1. **Create a Conda Environment**:
Create a new Conda environment with Python 3.10:
Create a new Conda environment with Python 3.12:
```bash
conda create -n ollama python=3.10
conda create -n ollama python=3.12
```
Activate the environment:
```bash

View file

@@ -9,7 +9,9 @@ pytest --help
```
Here are the most important options:
- `--stack-config`: specify the stack config to use. You have three ways to point to a stack:
- `--stack-config`: specify the stack config to use. You have four ways to point to a stack:
- **`server:<config>`** - automatically start a server with the given config (e.g., `server:fireworks`). This provides one-step testing by auto-starting the server if the port is available, or reusing an existing server if already running.
- **`server:<config>:<port>`** - same as above but with a custom port (e.g., `server:together:8322`)
- a URL which points to a Llama Stack distribution server
- a template (e.g., `fireworks`, `together`) or a path to a `run.yaml` file
- a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
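The comma-separated `api=provider` form is not covered by the examples below, so here is a hedged sketch of what such an invocation could look like; the test path and model name are reused from the examples in this document and may need adjusting for your setup:

```bash
pytest -s -v tests/integration/inference/test_text_inference.py \
   --stack-config=inference=fireworks \
   --text-model=meta-llama/Llama-3.1-8B-Instruct
```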
@@ -26,12 +28,39 @@ Model parameters can be influenced by the following options:
Each of these is a comma-separated list and can be used to generate multiple parameter combinations. Note that tests will be skipped
if no model is specified.
Experimental options (under development):
- `--record-responses`: record new API responses instead of using cached ones
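As a hedged illustration (not taken from the original docs), the flag is simply appended to an ordinary invocation:

```bash
# Assumed usage: re-record cached API responses while running one test file
pytest -s -v tests/integration/inference/test_text_inference.py \
   --stack-config=fireworks \
   --text-model=meta-llama/Llama-3.1-8B-Instruct \
   --record-responses
```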
## Examples
### Testing against a Server
Run all text inference tests by auto-starting a server with the `fireworks` config:
```bash
pytest -s -v tests/integration/inference/test_text_inference.py \
--stack-config=server:fireworks \
--text-model=meta-llama/Llama-3.1-8B-Instruct
```
Run tests with auto-server startup on a custom port:
```bash
pytest -s -v tests/integration/inference/ \
--stack-config=server:together:8322 \
--text-model=meta-llama/Llama-3.1-8B-Instruct
```
Run multiple test suites with auto-server (eliminates manual server management):
```bash
# Auto-start server and run all integration tests
export FIREWORKS_API_KEY=<your_key>
pytest -s -v tests/integration/inference/ tests/integration/safety/ tests/integration/agents/ \
--stack-config=server:fireworks \
--text-model=meta-llama/Llama-3.1-8B-Instruct
```
### Testing with Library Client
Run all text inference tests with the `together` distribution:
```bash

View file

@@ -6,9 +6,13 @@
import inspect
import os
import socket
import subprocess
import tempfile
import time
import pytest
import requests
import yaml
from llama_stack_client import LlamaStackClient
from openai import OpenAI
@@ -17,6 +21,44 @@ from llama_stack import LlamaStackAsLibraryClient
from llama_stack.distribution.stack import run_config_from_adhoc_config_spec
from llama_stack.env import get_env_or_fail
DEFAULT_PORT = 8321

def is_port_available(port: int, host: str = "localhost") -> bool:
    """Check if a port is available for binding."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind((host, port))
            return True
    except OSError:
        return False


def start_llama_stack_server(config_name: str) -> subprocess.Popen:
    """Start a llama stack server with the given config."""
    cmd = ["llama", "stack", "run", config_name]

    # Start server in background
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return process


def wait_for_server_ready(base_url: str, timeout: int = 120) -> bool:
    """Wait for the server to be ready by polling the health endpoint."""
    health_url = f"{base_url}/v1/health"
    start_time = time.time()

    while time.time() - start_time < timeout:
        try:
            response = requests.get(health_url, timeout=5)
            if response.status_code == 200:
                return True
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            pass
        time.sleep(0.5)

    return False

@pytest.fixture(scope="session")
def provider_data():
@@ -122,6 +164,40 @@ def llama_stack_client(request, provider_data):
    if not config:
        raise ValueError("You must specify either --stack-config or LLAMA_STACK_CONFIG")

    # Handle server:<config_name> format or server:<config_name>:<port>
    if config.startswith("server:"):
        parts = config.split(":")
        config_name = parts[1]
        port = int(parts[2]) if len(parts) > 2 else int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
        base_url = f"http://localhost:{port}"

        # Check if port is available
        if is_port_available(port):
            print(f"Starting llama stack server with config '{config_name}' on port {port}...")

            # Start server
            server_process = start_llama_stack_server(config_name)

            # Wait for server to be ready
            if not wait_for_server_ready(base_url, timeout=120):
                print("Server failed to start within timeout")
                server_process.terminate()
                raise RuntimeError(
                    f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid."
                )

            print(f"Server is ready at {base_url}")

            # Store process for potential cleanup (pytest will handle termination at session end)
            request.session._llama_stack_server_process = server_process
        else:
            print(f"Port {port} is already in use, assuming server is already running...")

        return LlamaStackClient(
            base_url=base_url,
            provider_data=provider_data,
        )

    # check if this looks like a URL
    if config.startswith("http") or "//" in config:
        return LlamaStackClient(