Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-17 06:53:13 +00:00
refactor(tests): make the responses tests nicer (#3161)
# What does this PR do?

A _bunch_ of cleanup for the Responses tests.

- Got rid of YAML test cases, moved them to just use simple pydantic models
- Split the large monolithic test file into multiple focused test files:
  - `test_basic_responses.py` for basic and image response tests
  - `test_tool_responses.py` for tool-related tests
  - `test_file_search.py` for file search specific tests
- Added a `StreamingValidator` helper class to standardize streaming response validation (sketched below)

## Test Plan

Run the tests:

```
pytest -s -v tests/integration/non_ci/responses/ \
  --stack-config=starter \
  --text-model openai/gpt-4o \
  --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
  -k "client_with_models"
```
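For context on the last bullet: the actual `StreamingValidator` implementation is not part of the fixtures file shown below, so the following is only a hypothetical sketch of what "standardize streaming response validation" could look like. The event type names (`response.created`, `response.completed`) follow the OpenAI Responses streaming convention and are assumptions here, not code from this PR.

```python
# Hypothetical sketch only -- the real StreamingValidator added by this PR
# lives in the test helpers and may differ. Event type names are assumed.


class StreamingValidator:
    """Collects streaming chunks and asserts basic event-ordering invariants."""

    def __init__(self, chunks):
        self.chunks = list(chunks)
        self.event_types = [chunk.type for chunk in self.chunks]

    def assert_basic_event_sequence(self):
        # A well-formed stream starts with a created event and ends completed.
        assert len(self.chunks) >= 2, "stream should contain at least two events"
        assert self.event_types[0] == "response.created"
        assert self.event_types[-1] == "response.completed"
```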
This commit is contained in:
parent ba664474de · commit 8ed69978f9

11 changed files with 1312 additions and 1736 deletions
tests/integration/non_ci/responses/fixtures/test_cases.py (new file, +262 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel


class ResponsesTestCase(BaseModel):
    # Input can be a simple string or complex message structure
    input: str | list[dict[str, Any]]
    expected: str
    # Tools as flexible dict structure (gets validated at runtime by the API)
    tools: list[dict[str, Any]] | None = None
    # Multi-turn conversations with input/output pairs
    turns: list[tuple[str | list[dict[str, Any]], str]] | None = None
    # File search specific fields
    file_content: str | None = None
    file_path: str | None = None
    # Streaming flag
    stream: bool | None = None


# Basic response test cases
basic_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Which planet do humans live on?",
            expected="earth",
        ),
        id="earth",
    ),
    pytest.param(
        ResponsesTestCase(
            input="Which planet has rings around it with a name starting with letter S?",
            expected="saturn",
        ),
        id="saturn",
    ),
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "what teams are playing in this image?",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg",
                        }
                    ],
                },
            ],
            expected="brooklyn nets",
        ),
        id="image_input",
    ),
]

# Multi-turn test cases
multi_turn_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                ("Which planet do humans live on?", "earth"),
                ("What is the name of the planet from your previous response?", "earth"),
            ],
        ),
        id="earth",
    ),
]

# Web search test cases
web_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "web_search", "search_context_size": "low"}],
            expected="128",
        ),
        id="llama_experts",
    ),
]

# File search test cases
file_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_content="Llama 4 Maverick has 128 experts",
        ),
        id="llama_experts",
    ),
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            file_path="pdfs/llama_stack_and_models.pdf",
        ),
        id="llama_experts_pdf",
    ),
]

# MCP tool test cases
mcp_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What is the boiling point of myawesomeliquid in Celsius?",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="Hello, world!",
        ),
        id="boiling_point_tool",
    ),
]

# Custom tool test cases
custom_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What's the weather like in San Francisco?",
            tools=[
                {
                    "type": "function",
                    "name": "get_weather",
                    "description": "Get current temperature for a given location.",
                    "parameters": {
                        "additionalProperties": False,
                        "properties": {
                            "location": {
                                "description": "City and country e.g. Bogotá, Colombia",
                                "type": "string",
                            }
                        },
                        "required": ["location"],
                        "type": "object",
                    },
                }
            ],
            expected="",  # No specific expected output for custom tools
        ),
        id="sf_weather",
    ),
]

# Image test cases
image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Identify the type of animal in this image.",
                        },
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                        },
                    ],
                },
            ],
            expected="llama",
        ),
        id="llama_image",
    ),
]

# Multi-turn image test cases
multi_turn_image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                (
                    [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'.",
                                },
                                {
                                    "type": "input_image",
                                    "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                                },
                            ],
                        },
                    ],
                    "llama",
                ),
                (
                    "What country do you find this animal primarily in? What continent?",
                    "peru",
                ),
            ],
        ),
        id="llama_image_understanding",
    ),
]

# Multi-turn tool execution test cases
multi_turn_tool_execution_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="yes",
        ),
        id="user_file_access_check",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me the boiling point in Celsius.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="100°C",
        ),
        id="experiment_results_lookup",
    ),
]

# Multi-turn tool execution streaming test cases
multi_turn_tool_execution_streaming_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="no",
            stream=True,
        ),
        id="user_permissions_workflow",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Return only one tool call per step. Please stream your analysis process.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="85%",
            stream=True,
        ),
        id="experiment_analysis_streaming",
    ),
]
```
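For reference, the split test files would presumably parametrize over these lists along the following lines. This is a hypothetical consumer sketch, not code from this PR: the fixture names `openai_client` and `text_model_id` and the exact assertion are assumptions, though `client.responses.create(...)` and `response.output_text` are real OpenAI Python SDK APIs.

```python
# Hypothetical sketch of a consumer in test_basic_responses.py -- the real
# tests may differ in fixture names and assertions.
import pytest

from .fixtures.test_cases import basic_test_cases


@pytest.mark.parametrize("case", basic_test_cases)
def test_basic_response(openai_client, text_model_id, case):
    response = openai_client.responses.create(
        model=text_model_id,
        input=case.input,
        stream=False,
    )
    # Each ResponsesTestCase carries a substring the output should contain.
    assert case.expected.lower() in response.output_text.lower()
```

Using `pytest.param(..., id=...)` in the fixture lists is what gives the readable test IDs (e.g. `earth`, `saturn`, `image_input`) in place of the old YAML case names.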