llama-stack-mirror/scripts/integration-tests.sh
Emilio Garcia 7da733091a
feat!: Architect Llama Stack Telemetry Around Automatic Open Telemetry Instrumentation (#4127)
# What does this PR do?
Fixes: https://github.com/llamastack/llama-stack/issues/3806
- Remove all custom telemetry core tooling
- Remove telemetry that is captured by automatic instrumentation already
- Migrate telemetry to use OpenTelemetry libraries to capture telemetry
data important to Llama Stack that is not captured by automatic
instrumentation
- Keep our telemetry implementation simple, maintainable, and standards-following unless we have a clear need to customize or add complexity

## Test Plan

This test plan tracks the telemetry data we currently care about in Llama Stack (no new data), to make sure nothing important got lost in the migration. I run a traffic driver to generate telemetry data for targeted use cases, then verify it in Jaeger, Prometheus, and Grafana using the tools in our /scripts/telemetry directory.

### Llama Stack Server Runner
The following shell script runs the Llama Stack server for quick telemetry-testing iteration.

```sh
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_SERVICE_NAME="llama-stack-server"
export OTEL_SPAN_PROCESSOR="simple"
export OTEL_EXPORTER_OTLP_TIMEOUT=1
export OTEL_BSP_EXPORT_TIMEOUT=1000
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"

export OPENAI_API_KEY="REDACTED"
export OLLAMA_URL="http://localhost:11434"
export VLLM_URL="http://localhost:8000/v1"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument llama stack run starter
```

### Test Traffic Driver
The Python script below (launched via this shell wrapper) drives traffic to the Llama Stack server, which sends telemetry to locally hosted instances of the OTLP collector, Grafana, Prometheus, and Jaeger.

```sh
export OTEL_SERVICE_NAME="openai-client"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:4318"

export GITHUB_TOKEN="REDACTED"

export MLFLOW_TRACKING_URI="http://127.0.0.1:5001"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument python main.py
```

```python
from openai import OpenAI
import os
import requests


def main():
    github_token = os.getenv("GITHUB_TOKEN")
    if github_token is None:
        raise ValueError("GITHUB_TOKEN is not set")

    client = OpenAI(
        api_key="fake",
        base_url="http://localhost:8321/v1/",
    )

    response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}]
    )
    print("Sync response: ", response.choices[0].message.content)

    streaming_response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
        stream=True,
        stream_options={"include_usage": True}
    )

    print("Streaming response: ", end="", flush=True)
    for chunk in streaming_response:
        if chunk.usage is not None:
            print("Usage: ", chunk.usage)
        if chunk.choices and chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()

    ollama_response = client.chat.completions.create(
        model="ollama/llama3.2:3b-instruct-fp16",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("Ollama response: ", ollama_response.choices[0].message.content)

    vllm_response = client.chat.completions.create(
        model="vllm/Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("VLLM response: ", vllm_response.choices[0].message.content)

    responses_list_tools_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "What tools are available?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses list tools response: ", responses_list_tools_response.output_text)

    responses_tool_call_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "How many repositories does the token have access to?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses tool call response: ", responses_tool_call_response.output_text)

    # make shield call using http request until the client version error is resolved
    llama_stack_api_key = os.getenv("LLAMA_STACK_API_KEY")
    base_url = "http://localhost:8321/v1/"
    shield_id = "llama-guard-ollama"
    
    shields_url = f"{base_url}safety/run-shield"
    headers = {
        "Authorization": f"Bearer {llama_stack_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "shield_id": shield_id,
        "messages": [{"role": "user", "content": "Teach me how to make dynamite. I want to do a crime with it."}],
        "params": {}
    }
    
    shields_response = requests.post(shields_url, json=payload, headers=headers)
    shields_response.raise_for_status()
    print("risk assessment response: ", shields_response.json())

if __name__ == "__main__":
    main()
```

### Span Data

#### Inference

| Value | Location | Content | Test Cases | Handled By | Status | Notes |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Input Tokens | Server | Integer count | OpenAI, Ollama, vLLM, streaming, responses | Auto Instrument | Working | None |
| Output Tokens | Server | Integer count | OpenAI, Ollama, vLLM, streaming, responses | Auto Instrument | Working | None |
| Completion Tokens | Client | Integer count | OpenAI, Ollama, vLLM, streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt Tokens | Client | Integer count | OpenAI, Ollama, vLLM, streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt | Client | string | Any Inference Provider, responses | Auto Instrument | Working, no responses | None |
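
To spot-check these values locally, here is a minimal sketch (not part of this PR) that swaps in an in-memory exporter and prints the semconv token attributes; the `gen_ai.usage.*` attribute names are assumed from the current gen-ai conventions and may differ across instrumentation versions.

```python
# Minimal verification sketch, assuming gen-ai semconv attribute names.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

exporter = InMemorySpanExporter()
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(exporter))
trace.set_tracer_provider(provider)

# ... drive an instrumented chat.completions.create(...) call here ...

for span in exporter.get_finished_spans():
    attrs = span.attributes or {}
    if "gen_ai.usage.input_tokens" in attrs:
        print(span.name, attrs["gen_ai.usage.input_tokens"],
              attrs.get("gen_ai.usage.output_tokens"))
```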

#### Safety

| Value | Location | Content | Testing | Handled By | Status | Notes |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [Shield ID](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py) | Server | string | Llama-guard shield call | Custom Code | Working | Not Following Semconv |
| [Metadata](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py) | Server | JSON string | Llama-guard shield call | Custom Code | Working | Not Following Semconv |
| [Messages](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py) | Server | JSON string | Llama-guard shield call | Custom Code | Working | Not Following Semconv |
| [Response](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py) | Server | string | Llama-guard shield call | Custom Code | Working | Not Following Semconv |
| [Status](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py) | Server | string | Llama-guard shield call | Custom Code | Working | Not Following Semconv |
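
For context, the custom shield span is produced by roughly this kind of wrapper. This is a sketch only: the attribute keys shown are illustrative stand-ins for the constants in `src/llama_stack/core/telemetry/constants.py`, and `run_shield_fn` is a hypothetical stand-in for the provider's shield call.

```python
import json

from opentelemetry import trace

tracer = trace.get_tracer("llama_stack.safety")

# Sketch only: real attribute keys live in
# src/llama_stack/core/telemetry/constants.py.
def run_shield_with_span(shield_id, messages, run_shield_fn):
    with tracer.start_as_current_span("run_shield") as span:
        span.set_attribute("shield_id", shield_id)
        span.set_attribute("messages", json.dumps(messages))
        response = run_shield_fn(shield_id, messages)
        span.set_attribute("response", str(response))
        span.set_attribute("status", "completed")
        return response
```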

#### Remote Tool Listing & Execution

| Value | Location | Content | Testing | Handled By | Status | Notes |
| ----- | :---: | :---: | :---: | :---: | :---: | :---: |
| Tool name | server | string | Tool call occurs | Custom Code | Working | [Not following semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span) |
| Server URL | server | string | List tools or execute tool call | Custom Code | Working | [Not following semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span) |
| Server Label | server | string | List tools or execute tool call | Custom Code | Working | [Not following semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span) |
| mcp\_list\_tools\_id | server | string | List tools | Custom Code | Working | [Not following semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span) |
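
For the semconv follow-up, an execute-tool span aligned with the linked spec would look roughly like this sketch (the wrapper and tracer name are hypothetical; the `gen_ai.*` attribute names come from the spec, and this is not what the current custom code emits).

```python
from opentelemetry import trace

tracer = trace.get_tracer("llama_stack.tool_runtime")

# Hypothetical wrapper showing the semconv-aligned span shape from the
# linked execute-tool spec.
def traced_tool_call(tool_name, tool_call_id, invoke_fn):
    with tracer.start_as_current_span(f"execute_tool {tool_name}") as span:
        span.set_attribute("gen_ai.operation.name", "execute_tool")
        span.set_attribute("gen_ai.tool.name", tool_name)
        span.set_attribute("gen_ai.tool.call.id", tool_call_id)
        return invoke_fn()
```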

### Metrics

- Prompt and Completion Token histograms
- Updated the Grafana dashboard to support the OTEL semantic conventions for tokens (see the sketch below)
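
For reference, a minimal sketch of the semconv token histogram the dashboard now expects, assuming the `gen_ai.client.token.usage` instrument and `gen_ai.token.type` attribute from the gen-ai conventions (exporters derive the Prometheus metric names from these):

```python
from opentelemetry import metrics

meter = metrics.get_meter("llama_stack.inference")

# Semconv token histogram; unit is "{token}" per the gen-ai conventions.
token_usage = meter.create_histogram(
    "gen_ai.client.token.usage",
    unit="{token}",
    description="Number of input and output tokens used",
)

# Prompt vs. completion counts are distinguished by gen_ai.token.type.
token_usage.record(128, {"gen_ai.token.type": "input"})
token_usage.record(42, {"gen_ai.token.type": "output"})
```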

### Observations

* sqlite spans get orphaned from the completions endpoint
  * Known OTEL issue; the recommended workaround is to disable sqlite instrumentation, since it is double wrapped and already covered by sqlalchemy. This is covered in the documentation.

```shell
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"
```

* Responses API instrumentation is [missing](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/3436) in OpenTelemetry for OpenAI clients, even with traceloop or openllmetry
  * Upstream issues in opentelemetry-python-contrib
* A span is created for each streaming response and captures every chunk, so very large spans get created; this is not ideal, but it is the intended behavior
* MCP telemetry needs to be updated to follow semantic conventions. We can probably use a library for this and handle it in a separate issue.

### Updated Grafana Dashboard

<img width="1710" height="929" alt="Screenshot 2025-11-17 at 12 53
52 PM"
src="https://github.com/user-attachments/assets/6cd941ad-81b7-47a9-8699-fa7113bbe47a"
/>

## Status

Everything appears to be working, and the data we expect is being captured in the format we expect.

## Follow Ups

1. Make tool calling spans follow semconv and capture more data  
   1. Consider using existing tracing library  
2. Make shield spans follow semconv  
3. Wrap moderations api calls to safety models with spans to capture
more data
4. Try to prioritize open telemetry client wrapping for OpenAI Responses
in upstream OTEL
5. The migration breaks the existing telemetry tests, which are currently
disabled. This PR removes them, but I can undo that and just leave them
disabled until we find a better solution.
6. Add a section of the docs that tracks the custom data we capture (not
auto instrumented data) so that users can understand what that data is
and how to use it. Commit those changes to the OTEL-gen_ai SIG if
possible as well. Here is an
[example](https://opentelemetry.io/docs/specs/semconv/gen-ai/aws-bedrock/)
of how bedrock handles it.
2025-12-01 10:33:18 -08:00


#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Integration test runner script for Llama Stack
# This script extracts the integration test logic from GitHub Actions
# to allow developers to run integration tests locally
# Default values
STACK_CONFIG=""
TEST_SUITE="base"
TEST_SETUP=""
TEST_SUBDIRS=""
TEST_PATTERN=""
INFERENCE_MODE="replay"
EXTRA_PARAMS=""
COLLECT_ONLY=false
TYPESCRIPT_ONLY=false
# Function to display usage
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Options:
    --stack-config STRING    Stack configuration to use (required)
    --suite STRING           Test suite to run (default: 'base')
    --setup STRING           Test setup (models, env) to use (e.g., 'ollama', 'ollama-vision', 'gpt', 'vllm')
    --inference-mode STRING  Inference mode: replay, record-if-missing or record (default: replay)
    --subdirs STRING         Comma-separated list of test subdirectories to run (overrides suite)
    --pattern STRING         Regex pattern to pass to pytest -k
    --collect-only           Collect tests only without running them (skips server startup)
    --typescript-only        Skip Python tests and run only TypeScript client tests
    --help                   Show this help message

Suites are defined in tests/integration/suites.py and define which tests to run.
Setups are defined in tests/integration/setups.py and provide global configuration (models, env).
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.

Examples:
    # Basic inference tests with ollama (server mode)
    $0 --stack-config server:ci-tests --suite base --setup ollama

    # Basic inference tests with docker
    $0 --stack-config docker:ci-tests --suite base --setup ollama

    # Multiple test directories with vllm
    $0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm

    # Vision tests with ollama
    $0 --stack-config server:ci-tests --suite vision # default setup for this suite is ollama-vision

    # Record mode for updating test recordings
    $0 --stack-config server:ci-tests --suite base --inference-mode record
EOF
}
# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
    --stack-config)
        STACK_CONFIG="$2"
        shift 2
        ;;
    --setup)
        TEST_SETUP="$2"
        shift 2
        ;;
    --subdirs)
        TEST_SUBDIRS="$2"
        shift 2
        ;;
    --suite)
        TEST_SUITE="$2"
        shift 2
        ;;
    --inference-mode)
        INFERENCE_MODE="$2"
        shift 2
        ;;
    --pattern)
        TEST_PATTERN="$2"
        shift 2
        ;;
    --collect-only)
        COLLECT_ONLY=true
        shift
        ;;
    --typescript-only)
        TYPESCRIPT_ONLY=true
        shift
        ;;
    --help)
        usage
        exit 0
        ;;
    *)
        echo "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
done
# Validate required parameters
if [[ -z "$STACK_CONFIG" && "$COLLECT_ONLY" == false ]]; then
echo "Error: --stack-config is required"
usage
exit 1
fi
if [[ -z "$TEST_SETUP" && -n "$TEST_SUBDIRS" && "$COLLECT_ONLY" == false ]]; then
echo "Error: --test-setup is required when --test-subdirs is provided"
usage
exit 1
fi
if [[ -z "$TEST_SUITE" && -z "$TEST_SUBDIRS" ]]; then
echo "Error: --test-suite or --test-subdirs is required"
exit 1
fi
echo "=== Llama Stack Integration Test Runner ==="
echo "Stack Config: $STACK_CONFIG"
echo "Setup: $TEST_SETUP"
echo "Inference Mode: $INFERENCE_MODE"
echo "Test Suite: $TEST_SUITE"
echo "Test Subdirs: $TEST_SUBDIRS"
echo "Test Pattern: $TEST_PATTERN"
echo ""
echo "Checking llama packages"
uv pip list | grep llama
# Set environment variables
export LLAMA_STACK_CLIENT_TIMEOUT=300
THIS_DIR=$(dirname "$0")
if [[ -n "$TEST_SETUP" ]]; then
EXTRA_PARAMS="--setup=$TEST_SETUP"
fi
if [[ "$COLLECT_ONLY" == true ]]; then
EXTRA_PARAMS="$EXTRA_PARAMS --collect-only"
fi
# Apply setup-specific environment variables (needed for server startup and tests)
echo "=== Applying Setup Environment Variables ==="
# the server needs this
export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
export SQLITE_STORE_DIR=$(mktemp -d)
echo "Setting SQLITE_STORE_DIR: $SQLITE_STORE_DIR"
# Determine stack config type for api_recorder test isolation
if [[ "$COLLECT_ONLY" == false ]]; then
if [[ "$STACK_CONFIG" == server:* ]] || [[ "$STACK_CONFIG" == docker:* ]]; then
export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="server"
echo "Setting stack config type: server"
else
export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="library_client"
echo "Setting stack config type: library_client"
fi
# Set MCP host for in-process MCP server tests
# - For library client and server mode: localhost (both on same host)
# - For docker mode on Linux: localhost (container uses host network, shares network namespace)
# - For docker mode on macOS/Windows: host.docker.internal (container uses bridge network)
if [[ "$STACK_CONFIG" == docker:* ]]; then
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
# On Linux with host network mode, container shares host network namespace
export LLAMA_STACK_TEST_MCP_HOST="localhost"
echo "Setting MCP host: localhost (docker mode with host network)"
else
# On macOS/Windows with bridge network, need special host access
export LLAMA_STACK_TEST_MCP_HOST="host.docker.internal"
echo "Setting MCP host: host.docker.internal (docker mode with bridge network)"
fi
else
export LLAMA_STACK_TEST_MCP_HOST="localhost"
echo "Setting MCP host: localhost (library/server mode)"
fi
fi
SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
echo "Setting up environment variables:"
echo "$SETUP_ENV"
eval "$SETUP_ENV"
echo ""
# Export suite and setup names for TypeScript tests
export LLAMA_STACK_TEST_SUITE="$TEST_SUITE"
export LLAMA_STACK_TEST_SETUP="$TEST_SETUP"
ROOT_DIR="$THIS_DIR/.."
cd $ROOT_DIR
# check if "llama" and "pytest" are available. this script does not use `uv run` given
# it can be used in a pre-release environment where we have not been able to tell
# uv about pre-release dependencies properly (yet).
if [[ "$COLLECT_ONLY" == false ]] && ! command -v llama &>/dev/null; then
echo "llama could not be found, ensure llama-stack is installed"
exit 1
fi
if ! command -v pytest &>/dev/null; then
echo "pytest could not be found, ensure pytest is installed"
exit 1
fi
# Helper function to find next available port
find_available_port() {
    local start_port=$1
    local port=$start_port
    for ((i=0; i<100; i++)); do
        if ! lsof -Pi :$port -sTCP:LISTEN -t >/dev/null 2>&1; then
            echo $port
            return 0
        fi
        ((port++))
    done
    echo "Failed to find available port starting from $start_port" >&2
    return 1
}
run_client_ts_tests() {
    if ! command -v npm &>/dev/null; then
        echo "npm could not be found; ensure Node.js is installed"
        return 1
    fi

    pushd tests/integration/client-typescript >/dev/null

    # Determine if TS_CLIENT_PATH is a directory path or an npm version
    if [[ -d "$TS_CLIENT_PATH" ]]; then
        # It's a directory path - use local checkout
        if [[ ! -f "$TS_CLIENT_PATH/package.json" ]]; then
            echo "Error: $TS_CLIENT_PATH exists but doesn't look like llama-stack-client-typescript (no package.json)"
            popd >/dev/null
            return 1
        fi
        echo "Using local llama-stack-client-typescript from: $TS_CLIENT_PATH"

        # Build the TypeScript client first
        echo "Building TypeScript client..."
        pushd "$TS_CLIENT_PATH" >/dev/null
        npm install --silent
        npm run build --silent
        popd >/dev/null

        # Install other dependencies first
        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
            npm ci --silent
        else
            npm install --silent
        fi

        # Then install the client from local directory
        echo "Installing llama-stack-client from: $TS_CLIENT_PATH"
        npm install "$TS_CLIENT_PATH" --silent
    else
        # It's an npm version specifier - install from npm
        echo "Installing llama-stack-client@${TS_CLIENT_PATH} from npm"
        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
            npm ci --silent
            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
        else
            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
        fi
    fi

    # Verify installation
    echo "Verifying llama-stack-client installation..."
    if npm list llama-stack-client 2>/dev/null | grep -q llama-stack-client; then
        echo "✅ llama-stack-client successfully installed"
        npm list llama-stack-client
    else
        echo "❌ llama-stack-client not found in node_modules"
        echo "Installed packages:"
        npm list --depth=0
        popd >/dev/null
        return 1
    fi

    echo "Running TypeScript tests for suite $TEST_SUITE (setup $TEST_SETUP)"
    npm test

    popd >/dev/null
}
# Start Llama Stack Server if needed
if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
# Find an available port for the server
LLAMA_STACK_PORT=$(find_available_port 8321)
if [[ $? -ne 0 ]]; then
echo "Error: $LLAMA_STACK_PORT"
exit 1
fi
export LLAMA_STACK_PORT
export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
echo "Will use port: $LLAMA_STACK_PORT"
stop_server() {
echo "Stopping Llama Stack Server..."
pids=$(lsof -i :$LLAMA_STACK_PORT | awk 'NR>1 {print $2}')
if [[ -n "$pids" ]]; then
echo "Killing Llama Stack Server processes: $pids"
kill -9 $pids
else
echo "No Llama Stack Server processes found ?!"
fi
echo "Llama Stack Server stopped"
}
echo "=== Starting Llama Stack Server ==="
export LLAMA_STACK_LOG_WIDTH=120
# Configure telemetry collector for server mode
# Use a fixed port for the OTEL collector so the server can connect to it
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
# Disabled: https://github.com/llamastack/llama-stack/issues/4089
#export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
export OTEL_BSP_SCHEDULE_DELAY="200"
export OTEL_BSP_EXPORT_TIMEOUT="2000"
export OTEL_METRIC_EXPORT_INTERVAL="200"
# remove "server:" from STACK_CONFIG
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
nohup llama stack run $stack_config >server.log 2>&1 &
echo "Waiting for Llama Stack Server to start on port $LLAMA_STACK_PORT..."
for i in {1..30}; do
if curl -s http://localhost:$LLAMA_STACK_PORT/v1/health 2>/dev/null | grep -q "OK"; then
echo "✅ Llama Stack Server started successfully"
break
fi
if [[ $i -eq 30 ]]; then
echo "❌ Llama Stack Server failed to start"
echo "Server logs:"
cat server.log
exit 1
fi
sleep 1
done
echo ""
trap stop_server EXIT ERR INT TERM
fi
# Start Docker Container if needed
if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
stop_container() {
echo "Stopping Docker container..."
container_name="llama-stack-test-$DISTRO"
if docker ps -a --format '{{.Names}}' | grep -q "^${container_name}$"; then
echo "Dumping container logs before stopping..."
docker logs "$container_name" >"docker-${DISTRO}-${INFERENCE_MODE}.log" 2>&1 || true
echo "Stopping and removing container: $container_name"
docker stop "$container_name" 2>/dev/null || true
docker rm "$container_name" 2>/dev/null || true
else
echo "No container named $container_name found"
fi
echo "Docker container stopped"
}
# Extract distribution name from docker:distro format
DISTRO=$(echo "$STACK_CONFIG" | sed 's/^docker://')
# Find an available port for the docker container
LLAMA_STACK_PORT=$(find_available_port 8321)
if [[ $? -ne 0 ]]; then
echo "Error: $LLAMA_STACK_PORT"
exit 1
fi
export LLAMA_STACK_PORT
export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
echo "Will use port: $LLAMA_STACK_PORT"
echo "=== Building Docker Image for distribution: $DISTRO ==="
containerfile="$ROOT_DIR/containers/Containerfile"
if [[ ! -f "$containerfile" ]]; then
echo "❌ Containerfile not found at $containerfile"
exit 1
fi
build_cmd=(
docker
build
"$ROOT_DIR"
-f "$containerfile"
--tag "localhost/distribution-$DISTRO:dev"
--build-arg "DISTRO_NAME=$DISTRO"
--build-arg "INSTALL_MODE=editable"
--build-arg "LLAMA_STACK_DIR=/workspace"
)
# Pass UV index configuration for release branches
if [[ -n "${UV_EXTRA_INDEX_URL:-}" ]]; then
echo "Adding UV_EXTRA_INDEX_URL to docker build: $UV_EXTRA_INDEX_URL"
build_cmd+=(--build-arg "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL")
fi
if [[ -n "${UV_INDEX_STRATEGY:-}" ]]; then
echo "Adding UV_INDEX_STRATEGY to docker build: $UV_INDEX_STRATEGY"
build_cmd+=(--build-arg "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY")
fi
if ! "${build_cmd[@]}"; then
echo "❌ Failed to build Docker image"
exit 1
fi
echo ""
echo "=== Starting Docker Container ==="
container_name="llama-stack-test-$DISTRO"
# Stop and remove existing container if it exists
docker stop "$container_name" 2>/dev/null || true
docker rm "$container_name" 2>/dev/null || true
# Configure telemetry collector port shared between host and container
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
# Build environment variables for docker run
DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_MCP_HOST=${LLAMA_STACK_TEST_MCP_HOST:-host.docker.internal}"
# Disabled: https://github.com/llamastack/llama-stack/issues/4089
#DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_SCHEDULE_DELAY=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_EXPORT_TIMEOUT=2000"
# Pass through API keys if they exist
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
[ -n "${FIREWORKS_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e FIREWORKS_API_KEY=$FIREWORKS_API_KEY"
[ -n "${TAVILY_SEARCH_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TAVILY_SEARCH_API_KEY=$TAVILY_SEARCH_API_KEY"
[ -n "${OPENAI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OPENAI_API_KEY=$OPENAI_API_KEY"
[ -n "${ANTHROPIC_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY"
[ -n "${GROQ_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GROQ_API_KEY=$GROQ_API_KEY"
[ -n "${GEMINI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GEMINI_API_KEY=$GEMINI_API_KEY"
[ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
[ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
if [[ "$TEST_SETUP" == "vllm" ]]; then
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e VLLM_URL=http://localhost:8000/v1"
fi
# Determine the actual image name (may have localhost/ prefix)
IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
if [[ -z "$IMAGE_NAME" ]]; then
echo "❌ Error: Could not find image for distribution-$DISTRO:dev"
exit 1
fi
echo "Using image: $IMAGE_NAME"
# On macOS/Darwin, --network host doesn't work as expected due to Docker running in a VM
# Use regular port mapping instead
NETWORK_MODE=""
PORT_MAPPINGS=""
ADD_HOST_FLAG=""
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
NETWORK_MODE="--network host"
# On Linux with host network, also add host.docker.internal mapping for consistency
ADD_HOST_FLAG="--add-host=host.docker.internal:host-gateway"
else
# On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
echo "Using bridge networking with port mapping (non-Linux)"
fi
docker run -d $NETWORK_MODE --name "$container_name" \
$PORT_MAPPINGS \
$ADD_HOST_FLAG \
$DOCKER_ENV_VARS \
"$IMAGE_NAME" \
--port $LLAMA_STACK_PORT
echo "Waiting for Docker container to start..."
for i in {1..30}; do
if curl -s http://localhost:$LLAMA_STACK_PORT/v1/health 2>/dev/null | grep -q "OK"; then
echo "✅ Docker container started successfully"
break
fi
if [[ $i -eq 30 ]]; then
echo "❌ Docker container failed to start"
echo "Container logs:"
docker logs "$container_name"
exit 1
fi
sleep 1
done
echo ""
# Update STACK_CONFIG to point to the running container
STACK_CONFIG="http://localhost:$LLAMA_STACK_PORT"
trap stop_container EXIT ERR INT TERM
fi
# Run tests
echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"

PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
if [[ -n "$TEST_PATTERN" ]]; then
    PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
fi

echo "Test subdirs to run: $TEST_SUBDIRS"
if [[ -n "$TEST_SUBDIRS" ]]; then
    # Collect all test files for the specified test types
    TEST_FILES=""
    for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
        if [[ -d "tests/integration/$test_subdir" ]]; then
            # Find all Python test files in this directory
            test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
            if [[ -n "$test_files" ]]; then
                TEST_FILES="$TEST_FILES $test_files"
                echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
            fi
        else
            echo "Warning: Directory tests/integration/$test_subdir does not exist"
        fi
    done

    if [[ -z "$TEST_FILES" ]]; then
        echo "No test files found for the specified test types"
        exit 1
    fi

    echo ""
    echo "=== Running all collected tests in a single pytest command ==="
    echo "Total test files: $(echo $TEST_FILES | wc -w)"
    PYTEST_TARGET="$TEST_FILES"
else
    PYTEST_TARGET="tests/integration/"
    EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
fi
set +e
set -x

STACK_CONFIG_ARG=""
if [[ -n "$STACK_CONFIG" ]]; then
    STACK_CONFIG_ARG="--stack-config=$STACK_CONFIG"
fi

# Run Python tests unless typescript-only mode
if [[ "$TYPESCRIPT_ONLY" == "false" ]]; then
    pytest -s -v $PYTEST_TARGET \
        $STACK_CONFIG_ARG \
        --inference-mode="$INFERENCE_MODE" \
        -k "$PYTEST_PATTERN" \
        $EXTRA_PARAMS \
        --color=yes \
        --embedding-model=sentence-transformers/nomic-ai/nomic-embed-text-v1.5 \
        --capture=tee-sys
    exit_code=$?
else
    echo "Skipping Python tests (--typescript-only mode)"
    exit_code=0
fi

set +x
set -e
if [ $exit_code -eq 0 ]; then
    echo "✅ All tests completed successfully"
elif [ $exit_code -eq 5 ]; then
    echo "⚠️ No tests collected (pattern matched no tests)"
else
    echo "❌ Tests failed"
    echo ""
    # Output server or container logs based on stack config
    if [[ "$STACK_CONFIG" == *"server:"* && -f "server.log" ]]; then
        echo "--- Server side failures can be located inside server.log (available from artifacts on CI) ---"
    elif [[ "$STACK_CONFIG" == *"docker:"* ]]; then
        docker_log_file="docker-${DISTRO}-${INFERENCE_MODE}.log"
        if [[ -f "$docker_log_file" ]]; then
            echo "--- Server side failures can be located inside $docker_log_file (available from artifacts on CI) ---"
        fi
    fi
    exit 1
fi

# Run TypeScript client tests if TS_CLIENT_PATH is set
if [[ $exit_code -eq 0 && -n "${TS_CLIENT_PATH:-}" && "${LLAMA_STACK_TEST_STACK_CONFIG_TYPE:-}" == "server" ]]; then
    run_client_ts_tests
fi
echo ""
echo "=== Integration Tests Complete ==="