llama-stack-mirror/tests/integration/non_ci/responses/fixtures/test_cases.py
ashwinb 8ed69978f9
refactor(tests): make the responses tests nicer (#3161)
# What does this PR do?

A _bunch_ of cleanup for the Responses tests.

- Got rid of the YAML test cases and moved them to simple Pydantic models
- Split the large monolithic test file into multiple focused test files:
   - `test_basic_responses.py` for basic and image response tests
   - `test_tool_responses.py` for tool-related tests
   - `test_file_search.py` for file-search-specific tests
- Added a `StreamingValidator` helper class to standardize streaming response validation (see the sketch below)
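
For context, here is a minimal sketch of the idea behind that helper (illustrative only; the real class lives alongside the tests and may differ):

```
class StreamingValidator:
    """Collects streamed response events and asserts basic invariants."""

    def __init__(self, events):
        self.events = list(events)

    def assert_basic_event_sequence(self):
        types = [event.type for event in self.events]
        assert types, "expected at least one streamed event"
        assert types[0] == "response.created"
        assert types[-1] == "response.completed"
```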

## Test Plan

Run the tests:

```
pytest -s -v tests/integration/non_ci/responses/ \
   --stack-config=starter \
   --text-model openai/gpt-4o \
   --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
    -k "client_with_models"
```
2025-08-15 00:05:36 +00:00


# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel


class ResponsesTestCase(BaseModel):
    # Input can be a simple string or complex message structure
    input: str | list[dict[str, Any]]
    expected: str
    # Tools as flexible dict structure (gets validated at runtime by the API)
    tools: list[dict[str, Any]] | None = None
    # Multi-turn conversations with input/output pairs
    turns: list[tuple[str | list[dict[str, Any]], str]] | None = None
    # File search specific fields
    file_content: str | None = None
    file_path: str | None = None
    # Streaming flag
    stream: bool | None = None
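
# Usage sketch (illustrative, not definitive; fixture names like `openai_client`
# and `text_model_id` are assumptions): tests parametrize over these cases,
# roughly like so:
#
#     @pytest.mark.parametrize("case", basic_test_cases)
#     def test_basic_response(openai_client, text_model_id, case):
#         response = openai_client.responses.create(model=text_model_id, input=case.input)
#         assert case.expected.lower() in response.output_text.lower()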

# Basic response test cases
basic_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Which planet do humans live on?",
            expected="earth",
        ),
        id="earth",
    ),
    pytest.param(
        ResponsesTestCase(
            input="Which planet has rings around it with a name starting with letter S?",
            expected="saturn",
        ),
        id="saturn",
    ),
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "what teams are playing in this image?",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg",
                        }
                    ],
                },
            ],
            expected="brooklyn nets",
        ),
        id="image_input",
    ),
]

# Multi-turn test cases
multi_turn_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                ("Which planet do humans live on?", "earth"),
                ("What is the name of the planet from your previous response?", "earth"),
            ],
        ),
        id="earth",
    ),
]
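
# Multi-turn sketch (illustrative): each turn chains onto the previous response,
# roughly like so:
#
#     previous_response_id = None
#     for turn_input, turn_expected in case.turns:
#         response = openai_client.responses.create(
#             model=text_model_id,
#             input=turn_input,
#             previous_response_id=previous_response_id,
#         )
#         previous_response_id = response.id
#         assert turn_expected.lower() in response.output_text.lower()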

# Web search test cases
web_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "web_search", "search_context_size": "low"}],
            expected="128",
        ),
        id="llama_experts",
    ),
]

# File search test cases
file_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_content="Llama 4 Maverick has 128 experts",
        ),
        id="llama_experts",
    ),
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_path="pdfs/llama_stack_and_models.pdf",
        ),
        id="llama_experts_pdf",
    ),
]

# MCP tool test cases
mcp_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What is the boiling point of myawesomeliquid in Celsius?",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="Hello, world!",
        ),
        id="boiling_point_tool",
    ),
]
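
# Placeholder sketch (illustrative): "<FILLED_BY_TEST_RUNNER>" is swapped for
# the URL of a locally spawned MCP server before the request is made, roughly:
#
#     tools = [dict(tool) for tool in case.tools]
#     for tool in tools:
#         if tool.get("server_url") == "<FILLED_BY_TEST_RUNNER>":
#             tool["server_url"] = mcp_server_url  # provided by a test fixture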

# Custom tool test cases
custom_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What's the weather like in San Francisco?",
            tools=[
                {
                    "type": "function",
                    "name": "get_weather",
                    "description": "Get current temperature for a given location.",
                    "parameters": {
                        "additionalProperties": False,
                        "properties": {
                            "location": {
                                "description": "City and country e.g. Bogotá, Colombia",
                                "type": "string",
                            }
                        },
                        "required": ["location"],
                        "type": "object",
                    },
                }
            ],
            expected="",  # No specific expected output for custom tools
        ),
        id="sf_weather",
    ),
]
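
# Assertion sketch (illustrative): since `expected` is empty here, a test would
# check that the model emitted a function call for the declared tool instead of
# matching output text, roughly:
#
#     calls = [item for item in response.output if item.type == "function_call"]
#     assert calls and calls[0].name == "get_weather"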

# Image test cases
image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Identify the type of animal in this image.",
                        },
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                        },
                    ],
                },
            ],
            expected="llama",
        ),
        id="llama_image",
    ),
]

# Multi-turn image test cases
multi_turn_image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                (
                    [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'.",
                                },
                                {
                                    "type": "input_image",
                                    "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                                },
                            ],
                        },
                    ],
                    "llama",
                ),
                (
                    "What country do you find this animal primarily in? What continent?",
                    "peru",
                ),
            ],
        ),
        id="llama_image_understanding",
    ),
]

# Multi-turn tool execution test cases
multi_turn_tool_execution_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="yes",
        ),
        id="user_file_access_check",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me the boiling point in Celsius.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="100°C",
        ),
        id="experiment_results_lookup",
    ),
]

# Multi-turn tool execution streaming test cases
multi_turn_tool_execution_streaming_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="no",
            stream=True,
        ),
        id="user_permissions_workflow",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Return only one tool call per step. Please stream your analysis process.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="85%",
            stream=True,
        ),
        id="experiment_analysis_streaming",
    ),
]
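
# Streaming sketch (illustrative): cases with stream=True iterate the event
# stream, which can then be handed to the StreamingValidator helper, roughly:
#
#     stream = openai_client.responses.create(
#         model=text_model_id, input=case.input, tools=tools, stream=True
#     )
#     events = list(stream)
#     assert events[0].type == "response.created"
#     assert events[-1].type == "response.completed"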