chore(tests): refactor and move responses tests away from verifications (#3068)

This PR kills the verifications infrastructure, which is no longer used.
It was previously relocated to the `llama-stack-evals`
(https://github.com/meta-llama/llama-stack-evals) repository.

The Responses tests used this infrastructure, but that was never strictly
necessary; it was just a little convenient back when @bbrownin introduced
the tests. On Discord, we agreed that these tests can be moved to our
regular integration test infra.

## Test Plan

Some tests currently do fail (although they run!). I will send a
follow-up PR that makes them all pass.
Ashwin Bharambe 2025-08-07 13:48:16 -07:00 committed by GitHub
parent 342550c1e2
commit 5f1ddd35e4
36 changed files with 93 additions and 13032 deletions


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,137 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import re
from pathlib import Path

import pytest
import yaml
from openai import OpenAI

from llama_stack import LlamaStackAsLibraryClient

# --- Helper Functions ---


def _load_all_verification_configs():
    """Load and aggregate verification configs from the conf/ directory."""
    # Note: Path is relative to *this* file (fixtures.py)
    conf_dir = Path(__file__).parent.parent.parent / "conf"
    if not conf_dir.is_dir():
        # Use pytest.fail if called during test collection, otherwise raise error
        # For simplicity here, we'll raise an error, assuming direct calls
        # are less likely or can handle it.
        raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")

    all_provider_configs = {}
    yaml_files = list(conf_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")

    for config_path in yaml_files:
        provider_name = config_path.stem
        try:
            with open(config_path) as f:
                provider_config = yaml.safe_load(f)
            if provider_config:
                all_provider_configs[provider_name] = provider_config
            else:
                # Log warning if possible, or just skip empty files silently
                print(f"Warning: Config file {config_path} is empty or invalid.")
        except Exception as e:
            raise OSError(f"Error loading config file {config_path}: {e}") from e

    return {"providers": all_provider_configs}


def case_id_generator(case):
    """Generate a test ID from the case's 'case_id' field, or use a default."""
    case_id = case.get("case_id")
    if isinstance(case_id, str | int):
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None


# Helper to get the base test name from the request object
def get_base_test_name(request):
    return request.node.originalname


# --- End Helper Functions ---
@pytest.fixture(scope="session")
def verification_config():
    """Pytest fixture to provide the loaded verification config."""
    try:
        return _load_all_verification_configs()
    except (OSError, FileNotFoundError) as e:
        pytest.fail(str(e))  # Fail test collection if config loading fails


@pytest.fixture(scope="session")
def provider(request, verification_config):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in verification_config["providers"].items():
            if metadata["base_url"] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture(scope="session")
def base_url(request, provider, verification_config):
    return request.config.getoption("--base-url") or verification_config.get("providers", {}).get(provider, {}).get(
        "base_url"
    )


@pytest.fixture(scope="session")
def api_key(request, provider, verification_config):
    provider_conf = verification_config.get("providers", {}).get(provider, {})
    api_key_env_var = provider_conf.get("api_key_var")

    key_from_option = request.config.getoption("--api-key")
    key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None

    final_key = key_from_option or key_from_env
    return final_key


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture(scope="session")
def openai_client(base_url, api_key, provider):
    # Simplify running against a local Llama Stack
    if base_url and "localhost" in base_url and not api_key:
        api_key = "empty"

    if provider.startswith("stack:"):
        parts = provider.split(":")
        if len(parts) != 2:
            raise ValueError(f"Invalid config for Llama Stack: {provider}, it must be of the form 'stack:<config>'")
        config = parts[1]
        client = LlamaStackAsLibraryClient(config, skip_logger_removal=True)
        if not client.initialize():
            raise RuntimeError("Initialization failed")
        return client

    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
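
Taken together, these fixtures resolve a provider, base URL, and API key and hand back either an `OpenAI` client or an in-process `LlamaStackAsLibraryClient`. A minimal sketch of a test consuming them, assuming the returned client exposes the OpenAI-compatible `chat.completions` surface (the test name and model id below are placeholders, not part of this PR):

```python
def test_chat_smoke(openai_client):
    # Placeholder model id; real tests parametrize this via CLI options or config.
    response = openai_client.chat.completions.create(
        model="<text-model-id>",
        messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    )
    assert "earth" in response.choices[0].message.content.lower()
```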

Binary file not shown (new image, 108 KiB).

Binary file not shown (new image, 148 KiB).

Binary file not shown (new image, 139 KiB).


@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path

import yaml


def load_test_cases(name: str):
    fixture_dir = Path(__file__).parent / "test_cases"
    yaml_path = fixture_dir / f"{name}.yaml"
    with open(yaml_path) as f:
        return yaml.safe_load(f)
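
A hedged sketch of how `load_test_cases` is typically wired into a test via `pytest.mark.parametrize`, together with the `case_id_generator` helper from the fixtures above (the import paths, YAML file name, and model id are assumptions for illustration):

```python
import pytest

# Assumed import paths; adjust to wherever the helpers shown above actually live.
from fixtures.fixtures import case_id_generator  # assumed path
from fixtures.load import load_test_cases  # assumed path

chat_cases = load_test_cases("chat_completion")  # assumed YAML file name


@pytest.mark.parametrize(
    "case",
    chat_cases["test_chat_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_basic(openai_client, case):
    response = openai_client.chat.completions.create(
        model="<text-model-id>",  # placeholder
        messages=case["input"]["messages"],
    )
    assert case["output"].lower() in response.choices[0].message.content.lower()
```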


@@ -0,0 +1,397 @@
test_chat_basic:
test_name: test_chat_basic
test_params:
case:
- case_id: "earth"
input:
messages:
- content: Which planet do humans live on?
role: user
output: Earth
- case_id: "saturn"
input:
messages:
- content: Which planet has rings around it with a name starting with letter
S?
role: user
output: Saturn
test_chat_input_validation:
test_name: test_chat_input_validation
test_params:
case:
- case_id: "messages_missing"
input:
messages: []
output:
error:
status_code: 400
- case_id: "messages_role_invalid"
input:
messages:
- content: Which planet do humans live on?
role: fake_role
output:
error:
status_code: 400
- case_id: "tool_choice_invalid"
input:
messages:
- content: Which planet do humans live on?
role: user
tool_choice: invalid
output:
error:
status_code: 400
- case_id: "tool_choice_no_tools"
input:
messages:
- content: Which planet do humans live on?
role: user
tool_choice: required
output:
error:
status_code: 400
- case_id: "tools_type_invalid"
input:
messages:
- content: Which planet do humans live on?
role: user
tools:
- type: invalid
output:
error:
status_code: 400
test_chat_image:
test_name: test_chat_image
test_params:
case:
- input:
messages:
- content:
- text: What is in this image?
type: text
- image_url:
url: https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg
type: image_url
role: user
output: llama
test_chat_structured_output:
test_name: test_chat_structured_output
test_params:
case:
- case_id: "calendar"
input:
messages:
- content: Extract the event information.
role: system
- content: Alice and Bob are going to a science fair on Friday.
role: user
response_format:
json_schema:
name: calendar_event
schema:
properties:
date:
title: Date
type: string
name:
title: Name
type: string
participants:
items:
type: string
title: Participants
type: array
required:
- name
- date
- participants
title: CalendarEvent
type: object
type: json_schema
output: valid_calendar_event
- case_id: "math"
input:
messages:
- content: You are a helpful math tutor. Guide the user through the solution
step by step.
role: system
- content: how can I solve 8x + 7 = -23
role: user
response_format:
json_schema:
name: math_reasoning
schema:
$defs:
Step:
properties:
explanation:
title: Explanation
type: string
output:
title: Output
type: string
required:
- explanation
- output
title: Step
type: object
properties:
final_answer:
title: Final Answer
type: string
steps:
items:
$ref: '#/$defs/Step'
title: Steps
type: array
required:
- steps
- final_answer
title: MathReasoning
type: object
type: json_schema
output: valid_math_reasoning
test_tool_calling:
test_name: test_tool_calling
test_params:
case:
- input:
messages:
- content: You are a helpful assistant that can use tools to get information.
role: system
- content: What's the weather like in San Francisco?
role: user
tools:
- function:
description: Get current temperature for a given location.
name: get_weather
parameters:
additionalProperties: false
properties:
location:
description: "City and country e.g. Bogot\xE1, Colombia"
type: string
required:
- location
type: object
type: function
output: get_weather_tool_call
test_chat_multi_turn_tool_calling:
test_name: test_chat_multi_turn_tool_calling
test_params:
case:
- case_id: "text_then_weather_tool"
input:
messages:
- - role: user
content: "What's the name of the Sun in latin?"
- - role: user
content: "What's the weather like in San Francisco?"
tools:
- function:
description: Get the current weather
name: get_weather
parameters:
type: object
properties:
location:
description: "The city and state (both required), e.g. San Francisco, CA."
type: string
required: ["location"]
type: function
tool_responses:
- response: "{'response': '70 degrees and foggy'}"
expected:
- num_tool_calls: 0
answer: ["sol"]
- num_tool_calls: 1
tool_name: get_weather
tool_arguments:
location: "San Francisco, CA"
- num_tool_calls: 0
answer: ["foggy", "70 degrees"]
- case_id: "weather_tool_then_text"
input:
messages:
- - role: user
content: "What's the weather like in San Francisco?"
tools:
- function:
description: Get the current weather
name: get_weather
parameters:
type: object
properties:
location:
description: "The city and state (both required), e.g. San Francisco, CA."
type: string
required: ["location"]
type: function
tool_responses:
- response: "{'response': '70 degrees and foggy'}"
expected:
- num_tool_calls: 1
tool_name: get_weather
tool_arguments:
location: "San Francisco, CA"
- num_tool_calls: 0
answer: ["foggy", "70 degrees"]
- case_id: "add_product_tool"
input:
messages:
- - role: user
content: "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id."
tools:
- function:
description: Add a new product
name: addProduct
parameters:
type: object
properties:
name:
description: "Name of the product"
type: string
price:
description: "Price of the product"
type: number
inStock:
description: "Availability status of the product."
type: boolean
tags:
description: "List of product tags"
type: array
items:
type: string
required: ["name", "price", "inStock"]
type: function
tool_responses:
- response: "{'response': 'Successfully added product with id: 123'}"
expected:
- num_tool_calls: 1
tool_name: addProduct
tool_arguments:
name: "Widget"
price: 19.99
inStock: true
tags:
- "new"
- "sale"
- num_tool_calls: 0
answer: ["123", "product id: 123"]
- case_id: "get_then_create_event_tool"
input:
messages:
- - role: system
content: "Todays date is 2025-03-01."
- role: user
content: "Do i have any meetings on March 3rd at 10 am? Yes or no?"
- - role: user
content: "Alright then, Create an event named 'Team Building', scheduled for that time same time, in the 'Main Conference Room' and add Alice, Bob, Charlie to it. Give me the created event id."
tools:
- function:
description: Create a new event
name: create_event
parameters:
type: object
properties:
name:
description: "Name of the event"
type: string
date:
description: "Date of the event in ISO format"
type: string
time:
description: "Event Time (HH:MM)"
type: string
location:
description: "Location of the event"
type: string
participants:
description: "List of participant names"
type: array
items:
type: string
required: ["name", "date", "time", "location", "participants"]
type: function
- function:
description: Get an event by date and time
name: get_event
parameters:
type: object
properties:
date:
description: "Date of the event in ISO format"
type: string
time:
description: "Event Time (HH:MM)"
type: string
required: ["date", "time"]
type: function
tool_responses:
- response: "{'response': 'No events found for 2025-03-03 at 10:00'}"
- response: "{'response': 'Successfully created new event with id: e_123'}"
expected:
- num_tool_calls: 1
tool_name: get_event
tool_arguments:
date: "2025-03-03"
time: "10:00"
- num_tool_calls: 0
answer: ["no", "no events found", "no meetings"]
- num_tool_calls: 1
tool_name: create_event
tool_arguments:
name: "Team Building"
date: "2025-03-03"
time: "10:00"
location: "Main Conference Room"
participants:
- "Alice"
- "Bob"
- "Charlie"
- num_tool_calls: 0
answer: ["e_123", "event id: e_123"]
- case_id: "compare_monthly_expense_tool"
input:
messages:
- - role: system
content: "Todays date is 2025-03-01."
- role: user
content: "what was my monthly expense in Jan of this year?"
- - role: user
content: "Was it less than Feb of last year? Only answer with yes or no."
tools:
- function:
description: Get monthly expense summary
name: getMonthlyExpenseSummary
parameters:
type: object
properties:
month:
description: "Month of the year (1-12)"
type: integer
year:
description: "Year"
type: integer
required: ["month", "year"]
type: function
tool_responses:
- response: "{'response': 'Total expenses for January 2025: $1000'}"
- response: "{'response': 'Total expenses for February 2024: $2000'}"
expected:
- num_tool_calls: 1
tool_name: getMonthlyExpenseSummary
tool_arguments:
month: 1
year: 2025
- num_tool_calls: 0
answer: ["1000", "$1,000", "1,000"]
- num_tool_calls: 1
tool_name: getMonthlyExpenseSummary
tool_arguments:
month: 2
year: 2024
- num_tool_calls: 0
answer: ["yes"]
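
For orientation, here is a hedged sketch of how a `test_tool_calling` case above could be exercised against an OpenAI-compatible client; the helper name and model id are placeholders, and the assertion only loosely mirrors the `get_weather_tool_call` expectation:

```python
import json


def run_tool_calling_case(openai_client, case):
    # The case carries OpenAI-style "messages" and "tools" payloads verbatim.
    response = openai_client.chat.completions.create(
        model="<text-model-id>",  # placeholder
        messages=case["input"]["messages"],
        tools=case["input"]["tools"],
        stream=False,
    )
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls is not None and len(tool_calls) == 1
    assert tool_calls[0].function.name == "get_weather"
    args = json.loads(tool_calls[0].function.arguments)
    assert "san francisco" in args["location"].lower()
```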


@@ -0,0 +1,166 @@
test_response_basic:
test_name: test_response_basic
test_params:
case:
- case_id: "earth"
input: "Which planet do humans live on?"
output: "earth"
- case_id: "saturn"
input: "Which planet has rings around it with a name starting with letter S?"
output: "saturn"
- case_id: "image_input"
input:
- role: user
content:
- type: input_text
text: "what teams are playing in this image?"
- role: user
content:
- type: input_image
image_url: "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg"
output: "brooklyn nets"
test_response_multi_turn:
test_name: test_response_multi_turn
test_params:
case:
- case_id: "earth"
turns:
- input: "Which planet do humans live on?"
output: "earth"
- input: "What is the name of the planet from your previous response?"
output: "earth"
test_response_web_search:
test_name: test_response_web_search
test_params:
case:
- case_id: "llama_experts"
input: "How many experts does the Llama 4 Maverick model have?"
tools:
- type: web_search
search_context_size: "low"
output: "128"
test_response_file_search:
test_name: test_response_file_search
test_params:
case:
- case_id: "llama_experts"
input: "How many experts does the Llama 4 Maverick model have?"
tools:
- type: file_search
# vector_store_ids param for file_search tool gets added by the test runner
file_content: "Llama 4 Maverick has 128 experts"
output: "128"
- case_id: "llama_experts_pdf"
input: "How many experts does the Llama 4 Maverick model have?"
tools:
- type: file_search
# vector_store_ids param for file_search tool gets added by the test runner
file_path: "pdfs/llama_stack_and_models.pdf"
output: "128"
test_response_mcp_tool:
test_name: test_response_mcp_tool
test_params:
case:
- case_id: "boiling_point_tool"
input: "What is the boiling point of myawesomeliquid in Celsius?"
tools:
- type: mcp
server_label: "localmcp"
server_url: "<FILLED_BY_TEST_RUNNER>"
output: "Hello, world!"
test_response_custom_tool:
test_name: test_response_custom_tool
test_params:
case:
- case_id: "sf_weather"
input: "What's the weather like in San Francisco?"
tools:
- type: function
name: get_weather
description: Get current temperature for a given location.
parameters:
additionalProperties: false
properties:
location:
description: "City and country e.g. Bogot\xE1, Colombia"
type: string
required:
- location
type: object
test_response_image:
test_name: test_response_image
test_params:
case:
- case_id: "llama_image"
input:
- role: user
content:
- type: input_text
text: "Identify the type of animal in this image."
- type: input_image
image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
output: "llama"
# the models are really poor at tool calling after seeing images :/
test_response_multi_turn_image:
test_name: test_response_multi_turn_image
test_params:
case:
- case_id: "llama_image_understanding"
turns:
- input:
- role: user
content:
- type: input_text
text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
- type: input_image
image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
output: "llama"
- input: "What country do you find this animal primarily in? What continent?"
output: "peru"
test_response_multi_turn_tool_execution:
test_name: test_response_multi_turn_tool_execution
test_params:
case:
- case_id: "user_file_access_check"
input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response."
tools:
- type: mcp
server_label: "localmcp"
server_url: "<FILLED_BY_TEST_RUNNER>"
output: "yes"
- case_id: "experiment_results_lookup"
input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found."
tools:
- type: mcp
server_label: "localmcp"
server_url: "<FILLED_BY_TEST_RUNNER>"
output: "100°C"
test_response_multi_turn_tool_execution_streaming:
test_name: test_response_multi_turn_tool_execution_streaming
test_params:
case:
- case_id: "user_permissions_workflow"
input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step."
tools:
- type: mcp
server_label: "localmcp"
server_url: "<FILLED_BY_TEST_RUNNER>"
stream: true
output: "no"
- case_id: "experiment_analysis_streaming"
input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process."
tools:
- type: mcp
server_label: "localmcp"
server_url: "<FILLED_BY_TEST_RUNNER>"
stream: true
output: "85%"