llama-stack-mirror/tests/integration/responses/fixtures/test_cases.py
Derek Higgins 8951765584 test: improve test reliability and model compatibility
- Update earth question to be more specific with multiple choice format
  to prevent Llama-3.2-1B-Instruct from rambling about other planets
- Skip test_text_chat_completion_structured_output as it sometimes
  times out during CI execution again with Llama-3.2-1B-Instruct on vllm

Signed-off-by: Derek Higgins <derekh@redhat.com>
2025-09-24 09:38:42 +01:00

262 lines
9.1 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
import pytest
from pydantic import BaseModel
class ResponsesTestCase(BaseModel):
# Input can be a simple string or complex message structure
input: str | list[dict[str, Any]]
expected: str
# Tools as flexible dict structure (gets validated at runtime by the API)
tools: list[dict[str, Any]] | None = None
# Multi-turn conversations with input/output pairs
turns: list[tuple[str | list[dict[str, Any]], str]] | None = None
# File search specific fields
file_content: str | None = None
file_path: str | None = None
# Streaming flag
stream: bool | None = None
# Basic response test cases
basic_test_cases = [
pytest.param(
ResponsesTestCase(
input="Humans live on which planet: Mars, Venus, or Earth?",
expected="earth",
),
id="earth",
),
pytest.param(
ResponsesTestCase(
input="Which planet has rings around it with a name starting with letter S?",
expected="saturn",
),
id="saturn",
),
pytest.param(
ResponsesTestCase(
input=[
{
"role": "user",
"content": [
{
"type": "input_text",
"text": "what teams are playing in this image?",
}
],
},
{
"role": "user",
"content": [
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg",
}
],
},
],
expected="brooklyn nets",
),
id="image_input",
),
]
# Multi-turn test cases
multi_turn_test_cases = [
pytest.param(
ResponsesTestCase(
input="", # Not used for multi-turn
expected="", # Not used for multi-turn
turns=[
("Humans live on which planet: Mars, Venus, or Earth?", "earth"),
("What is the name of the planet from your previous response?", "earth"),
],
),
id="earth",
),
]
# Web search test cases
web_search_test_cases = [
pytest.param(
ResponsesTestCase(
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "web_search", "search_context_size": "low"}],
expected="128",
),
id="llama_experts",
),
]
# File search test cases
file_search_test_cases = [
pytest.param(
ResponsesTestCase(
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "file_search"}],
expected="128",
file_content="Llama 4 Maverick has 128 experts",
),
id="llama_experts",
),
pytest.param(
ResponsesTestCase(
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "file_search"}],
expected="128",
file_path="pdfs/llama_stack_and_models.pdf",
),
id="llama_experts_pdf",
),
]
# MCP tool test cases
mcp_tool_test_cases = [
pytest.param(
ResponsesTestCase(
input="What is the boiling point of myawesomeliquid in Celsius?",
tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
expected="Hello, world!",
),
id="boiling_point_tool",
),
]
# Custom tool test cases
custom_tool_test_cases = [
pytest.param(
ResponsesTestCase(
input="What's the weather like in San Francisco?",
tools=[
{
"type": "function",
"name": "get_weather",
"description": "Get current temperature for a given location.",
"parameters": {
"additionalProperties": False,
"properties": {
"location": {
"description": "City and country e.g. Bogotá, Colombia",
"type": "string",
}
},
"required": ["location"],
"type": "object",
},
}
],
expected="", # No specific expected output for custom tools
),
id="sf_weather",
),
]
# Image test cases
image_test_cases = [
pytest.param(
ResponsesTestCase(
input=[
{
"role": "user",
"content": [
{
"type": "input_text",
"text": "Identify the type of animal in this image.",
},
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
},
],
},
],
expected="llama",
),
id="llama_image",
),
]
# Multi-turn image test cases
multi_turn_image_test_cases = [
pytest.param(
ResponsesTestCase(
input="", # Not used for multi-turn
expected="", # Not used for multi-turn
turns=[
(
[
{
"role": "user",
"content": [
{
"type": "input_text",
"text": "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'.",
},
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
},
],
},
],
"llama",
),
(
"What country do you find this animal primarily in? What continent?",
"peru",
),
],
),
id="llama_image_understanding",
),
]
# Multi-turn tool execution test cases
multi_turn_tool_execution_test_cases = [
pytest.param(
ResponsesTestCase(
input="I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
expected="yes",
),
id="user_file_access_check",
),
pytest.param(
ResponsesTestCase(
input="I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me the boiling point in Celsius.",
tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
expected="100°C",
),
id="experiment_results_lookup",
),
]
# Multi-turn tool execution streaming test cases
multi_turn_tool_execution_streaming_test_cases = [
pytest.param(
ResponsesTestCase(
input="Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
expected="no",
stream=True,
),
id="user_permissions_workflow",
),
pytest.param(
ResponsesTestCase(
input="I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Return only one tool call per step. Please stream your analysis process.",
tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
expected="85%",
stream=True,
),
id="experiment_analysis_streaming",
),
]