Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-12 05:54:38 +00:00)
feat(tests): introduce a test "suite" concept to encompass dirs, options (#3339)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Python Package Build Test / build (3.13) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 4s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 4s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 3s
Test External API and Providers / test-external (venv) (push) Failing after 4s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
UI Tests / ui-tests (22) (push) Successful in 33s
Pre-commit / pre-commit (push) Successful in 1m15s
Our integration tests need to be grouped because each group often requires a specific set of models to work with. We already separated the vision tests for this reason, and we have a separate set of tests for the "Responses" API. This PR makes that grouping official so it is easy to target these groups and apply all testing infrastructure (for example, record-replay) to them uniformly.

Three suites are declared:

- base
- vision
- responses

Note that our CI currently runs the "base" and "vision" suites. Use the `--suite` option when running pytest (or any of the testing scripts or workflows). For example:

```
OLLAMA_URL=http://localhost:11434 \
  pytest -s -v tests/integration/ --stack-config starter --suite vision
```
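Conceptually, a suite just names the set of test directories (and default options) that the runner and record-replay infrastructure should target. A minimal sketch of that idea, assuming a simple name-to-directories mapping; the directory assignments and helper below are illustrative, not the actual definitions added by this PR:

```python
# Minimal sketch of the "suite" idea: a suite names the test directories it
# covers so the runner can target them uniformly. The directory mapping is an
# illustrative assumption, not the real definition from this PR.
SUITES: dict[str, list[str]] = {
    "base": ["tests/integration"],
    "vision": ["tests/integration/inference"],
    "responses": ["tests/integration/responses"],
}


def suite_roots(name: str) -> list[str]:
    """Resolve a --suite value to the directories pytest should collect."""
    if name not in SUITES:
        raise ValueError(f"unknown suite '{name}', expected one of {sorted(SUITES)}")
    return SUITES[name]
```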
parent 0c2757a05b
commit 47b640370e
25 changed files with 255 additions and 161 deletions
tests/integration/responses/fixtures/__init__.py (new file, +5 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
```
tests/integration/responses/fixtures/fixtures.py (new file, +121 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from pathlib import Path

import pytest
import yaml
from openai import OpenAI

from llama_stack import LlamaStackAsLibraryClient

# --- Helper Functions ---


def _load_all_verification_configs():
    """Load and aggregate verification configs from the conf/ directory."""
    # Note: Path is relative to *this* file (fixtures.py)
    conf_dir = Path(__file__).parent.parent.parent / "conf"
    if not conf_dir.is_dir():
        # Use pytest.fail if called during test collection, otherwise raise error
        # For simplicity here, we'll raise an error, assuming direct calls
        # are less likely or can handle it.
        raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")

    all_provider_configs = {}
    yaml_files = list(conf_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")

    for config_path in yaml_files:
        provider_name = config_path.stem
        try:
            with open(config_path) as f:
                provider_config = yaml.safe_load(f)
                if provider_config:
                    all_provider_configs[provider_name] = provider_config
                else:
                    # Log warning if possible, or just skip empty files silently
                    print(f"Warning: Config file {config_path} is empty or invalid.")
        except Exception as e:
            raise OSError(f"Error loading config file {config_path}: {e}") from e

    return {"providers": all_provider_configs}


# --- End Helper Functions ---


@pytest.fixture(scope="session")
def verification_config():
    """Pytest fixture to provide the loaded verification config."""
    try:
        return _load_all_verification_configs()
    except (OSError, FileNotFoundError) as e:
        pytest.fail(str(e))  # Fail test collection if config loading fails


@pytest.fixture(scope="session")
def provider(request, verification_config):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in verification_config["providers"].items():
            if metadata["base_url"] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture(scope="session")
def base_url(request, provider, verification_config):
    return request.config.getoption("--base-url") or verification_config.get("providers", {}).get(provider, {}).get(
        "base_url"
    )


@pytest.fixture(scope="session")
def api_key(request, provider, verification_config):
    provider_conf = verification_config.get("providers", {}).get(provider, {})
    api_key_env_var = provider_conf.get("api_key_var")

    key_from_option = request.config.getoption("--api-key")
    key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None

    final_key = key_from_option or key_from_env
    return final_key


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture(scope="session")
def openai_client(base_url, api_key, provider):
    # Simplify running against a local Llama Stack
    if base_url and "localhost" in base_url and not api_key:
        api_key = "empty"
    if provider.startswith("stack:"):
        parts = provider.split(":")
        if len(parts) != 2:
            raise ValueError(f"Invalid config for Llama Stack: {provider}, it must be of the form 'stack:<config>'")
        config = parts[1]
        client = LlamaStackAsLibraryClient(config, skip_logger_removal=True)
        return client

    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
```
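These fixtures read `--provider`, `--base-url`, and `--api-key` from pytest's option store, so a conftest must register those options. A rough sketch of that registration, under the assumption that a standard `pytest_addoption` hook is used; the defaults and help text are assumptions, and the real conftest.py may differ:

```python
# Sketch of the pytest option registration the fixtures above rely on.
# The option names come from the getoption() calls; defaults/help are assumed.
def pytest_addoption(parser):
    parser.addoption("--provider", action="store", default=None,
                     help="Provider name, or 'stack:<config>' to run Llama Stack as a library")
    parser.addoption("--base-url", action="store", default=None,
                     help="Base URL of an OpenAI-compatible endpoint")
    parser.addoption("--api-key", action="store", default=None,
                     help="API key; falls back to the provider's api_key_var environment variable")
```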
BIN  tests/integration/responses/fixtures/images/vision_test_1.jpg (new binary file, 108 KiB, not shown)
BIN  tests/integration/responses/fixtures/images/vision_test_2.jpg (new binary file, 148 KiB, not shown)
BIN  tests/integration/responses/fixtures/images/vision_test_3.jpg (new binary file, 139 KiB, not shown)
tests/integration/responses/fixtures/test_cases.py (new file, +262 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel


class ResponsesTestCase(BaseModel):
    # Input can be a simple string or complex message structure
    input: str | list[dict[str, Any]]
    expected: str
    # Tools as flexible dict structure (gets validated at runtime by the API)
    tools: list[dict[str, Any]] | None = None
    # Multi-turn conversations with input/output pairs
    turns: list[tuple[str | list[dict[str, Any]], str]] | None = None
    # File search specific fields
    file_content: str | None = None
    file_path: str | None = None
    # Streaming flag
    stream: bool | None = None


# Basic response test cases
basic_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Which planet do humans live on?",
            expected="earth",
        ),
        id="earth",
    ),
    pytest.param(
        ResponsesTestCase(
            input="Which planet has rings around it with a name starting with letter S?",
            expected="saturn",
        ),
        id="saturn",
    ),
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "what teams are playing in this image?",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg",
                        }
                    ],
                },
            ],
            expected="brooklyn nets",
        ),
        id="image_input",
    ),
]

# Multi-turn test cases
multi_turn_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                ("Which planet do humans live on?", "earth"),
                ("What is the name of the planet from your previous response?", "earth"),
            ],
        ),
        id="earth",
    ),
]

# Web search test cases
web_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "web_search", "search_context_size": "low"}],
            expected="128",
        ),
        id="llama_experts",
    ),
]

# File search test cases
file_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_content="Llama 4 Maverick has 128 experts",
        ),
        id="llama_experts",
    ),
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_path="pdfs/llama_stack_and_models.pdf",
        ),
        id="llama_experts_pdf",
    ),
]

# MCP tool test cases
mcp_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What is the boiling point of myawesomeliquid in Celsius?",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="Hello, world!",
        ),
        id="boiling_point_tool",
    ),
]

# Custom tool test cases
custom_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What's the weather like in San Francisco?",
            tools=[
                {
                    "type": "function",
                    "name": "get_weather",
                    "description": "Get current temperature for a given location.",
                    "parameters": {
                        "additionalProperties": False,
                        "properties": {
                            "location": {
                                "description": "City and country e.g. Bogotá, Colombia",
                                "type": "string",
                            }
                        },
                        "required": ["location"],
                        "type": "object",
                    },
                }
            ],
            expected="",  # No specific expected output for custom tools
        ),
        id="sf_weather",
    ),
]

# Image test cases
image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Identify the type of animal in this image.",
                        },
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                        },
                    ],
                },
            ],
            expected="llama",
        ),
        id="llama_image",
    ),
]

# Multi-turn image test cases
multi_turn_image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                (
                    [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'.",
                                },
                                {
                                    "type": "input_image",
                                    "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                                },
                            ],
                        },
                    ],
                    "llama",
                ),
                (
                    "What country do you find this animal primarily in? What continent?",
                    "peru",
                ),
            ],
        ),
        id="llama_image_understanding",
    ),
]

# Multi-turn tool execution test cases
multi_turn_tool_execution_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="yes",
        ),
        id="user_file_access_check",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me the boiling point in Celsius.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="100°C",
        ),
        id="experiment_results_lookup",
    ),
]

# Multi-turn tool execution streaming test cases
multi_turn_tool_execution_streaming_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="no",
            stream=True,
        ),
        id="user_permissions_workflow",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Return only one tool call per step. Please stream your analysis process.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="85%",
            stream=True,
        ),
        id="experiment_analysis_streaming",
    ),
]
```
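These lists are built with `pytest.param`, so they are meant to be fed straight into `pytest.mark.parametrize`. A hedged consumption sketch, assuming the `openai_client` fixture above is wired up via conftest; the import path, model id, and assertion are placeholders rather than code from this PR:

```python
# Illustrative consumer of the case lists above; not part of this commit.
import pytest

from .fixtures.test_cases import ResponsesTestCase, basic_test_cases  # placeholder import path


@pytest.mark.parametrize("case", basic_test_cases)
def test_basic_response(openai_client, case: ResponsesTestCase):
    # openai_client comes from fixtures.py; assumed to be exposed through conftest.
    response = openai_client.responses.create(
        model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model id
        input=case.input,
    )
    assert case.expected in response.output_text.lower()
```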