From 207224a8113d89e8abb8db016c630d79c0bbd330 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Fri, 18 Apr 2025 15:26:34 -0400
Subject: [PATCH] OpenAI Responses - move tests under tests/verifications

This moves the OpenAI Responses API tests under
tests/verifications/openai_api/test_response.py and starts to wire them
up to our verification suite, so that we can test multiple providers as
well as OpenAI directly for the Responses API.

Signed-off-by: Ben Browning
---
 .../apis/openai_responses/openai_responses.py  |  19 +-
 .../openai_responses/openai_responses.py       |  59 ++++---
 .../integration/openai_responses/__init__.py   |   5 -
 .../openai_responses/test_basic.py             |  83 ---------
 .../test_web_search_builtin.py                 | 101 -----------
 .../conf/fireworks-llama-stack.yaml            |   2 +
 tests/verifications/conf/groq-llama-stack.yaml |   2 +
 .../conf/together-llama-stack.yaml             |   2 +
 tests/verifications/generate_report.py         |   2 +-
 tests/verifications/openai_api/conftest.py     |  35 ++++
 .../openai_api/fixtures/fixtures.py            |  29 ++-
 .../fixtures/test_cases/response.yaml          |  65 +++++++
 .../openai_api/test_chat_completion.py         |  56 +-----
 .../verifications/openai_api/test_response.py  | 166 ++++++++++++++++++
 14 files changed, 353 insertions(+), 273 deletions(-)
 delete mode 100644 tests/integration/openai_responses/__init__.py
 delete mode 100644 tests/integration/openai_responses/test_basic.py
 delete mode 100644 tests/integration/openai_responses/test_web_search_builtin.py
 create mode 100644 tests/verifications/openai_api/conftest.py
 create mode 100644 tests/verifications/openai_api/fixtures/test_cases/response.yaml
 create mode 100644 tests/verifications/openai_api/test_response.py

diff --git a/llama_stack/apis/openai_responses/openai_responses.py b/llama_stack/apis/openai_responses/openai_responses.py
index 87ccfdabd..0b21f3f28 100644
--- a/llama_stack/apis/openai_responses/openai_responses.py
+++ b/llama_stack/apis/openai_responses/openai_responses.py
@@ -75,11 +75,27 @@ class OpenAIResponseObject(BaseModel):
 
 
 @json_schema_type
-class OpenAIResponseObjectStream(BaseModel):
+class OpenAIResponseObjectStreamResponseCreated(BaseModel):
     response: OpenAIResponseObject
     type: Literal["response.created"] = "response.created"
 
 
+@json_schema_type
+class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.completed"] = "response.completed"
+
+
+OpenAIResponseObjectStream = Annotated[
+    Union[
+        OpenAIResponseObjectStreamResponseCreated,
+        OpenAIResponseObjectStreamResponseCompleted,
+    ],
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
+
+
 @json_schema_type
 class OpenAIResponseInputMessageContentText(BaseModel):
     text: str
@@ -112,6 +128,7 @@ class OpenAIResponseInputMessage(BaseModel):
 @json_schema_type
 class OpenAIResponseInputToolWebSearch(BaseModel):
     type: Literal["web_search", "web_search_preview_2025_03_11"] = "web_search"
+    # TODO: actually use search_context_size somewhere...
search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$") # TODO: add user_location diff --git a/llama_stack/providers/inline/openai_responses/openai_responses.py b/llama_stack/providers/inline/openai_responses/openai_responses.py index 5f5df6ad0..c7d767f73 100644 --- a/llama_stack/providers/inline/openai_responses/openai_responses.py +++ b/llama_stack/providers/inline/openai_responses/openai_responses.py @@ -33,6 +33,8 @@ from llama_stack.apis.openai_responses.openai_responses import ( OpenAIResponseInputTool, OpenAIResponseObject, OpenAIResponseObjectStream, + OpenAIResponseObjectStreamResponseCompleted, + OpenAIResponseObjectStreamResponseCreated, OpenAIResponseOutput, OpenAIResponseOutputMessage, OpenAIResponseOutputMessageContentOutputText, @@ -174,7 +176,8 @@ class OpenAIResponsesImpl(OpenAIResponses): for chunk_choice in chunk.choices: # TODO: this only works for text content chat_response_content.append(chunk_choice.delta.content or "") - chunk_finish_reason = chunk_choice.finish_reason + if chunk_choice.finish_reason: + chunk_finish_reason = chunk_choice.finish_reason assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content)) chat_response = OpenAIChatCompletion( id=chat_response_id, @@ -219,7 +222,9 @@ class OpenAIResponsesImpl(OpenAIResponses): if stream: async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]: - yield OpenAIResponseObjectStream(response=response) + # TODO: response created should actually get emitted much earlier in the process + yield OpenAIResponseObjectStreamResponseCreated(response=response) + yield OpenAIResponseObjectStreamResponseCompleted(response=response) return async_response() @@ -270,40 +275,40 @@ class OpenAIResponsesImpl(OpenAIResponses): # Add the assistant message with tool_calls response to the messages list messages.append(choice.message) - # TODO: handle multiple tool calls - tool_call = choice.message.tool_calls[0] - tool_call_id = tool_call.id - function = tool_call.function + for tool_call in choice.message.tool_calls: + tool_call_id = tool_call.id + function = tool_call.function - # If for some reason the tool call doesn't have a function or id, we can't execute it - if not function or not tool_call_id: - return output_messages + # If for some reason the tool call doesn't have a function or id, we can't execute it + if not function or not tool_call_id: + continue - # TODO: telemetry spans for tool calls - result = await self._execute_tool_call(function) + # TODO: telemetry spans for tool calls + result = await self._execute_tool_call(function) + + # Handle tool call failure + if not result: + output_messages.append( + OpenAIResponseOutputMessageWebSearchToolCall( + id=tool_call_id, + status="failed", + ) + ) + continue - # Handle tool call failure - if not result: output_messages.append( OpenAIResponseOutputMessageWebSearchToolCall( id=tool_call_id, - status="failed", - ) + status="completed", + ), ) - return output_messages - output_messages.append( - OpenAIResponseOutputMessageWebSearchToolCall( - id=tool_call_id, - status="completed", - ), - ) + result_content = "" + # TODO: handle other result content types and lists + if isinstance(result.content, str): + result_content = result.content + messages.append(OpenAIToolMessageParam(content=result_content, tool_call_id=tool_call_id)) - result_content = "" - # TODO: handle other result content types and lists - if isinstance(result.content, str): - result_content = result.content - 
messages.append(OpenAIToolMessageParam(content=result_content, tool_call_id=tool_call_id)) tool_results_chat_response = await self.inference_api.openai_chat_completion( model=model_id, messages=messages, diff --git a/tests/integration/openai_responses/__init__.py b/tests/integration/openai_responses/__init__.py deleted file mode 100644 index 756f351d8..000000000 --- a/tests/integration/openai_responses/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/tests/integration/openai_responses/test_basic.py b/tests/integration/openai_responses/test_basic.py deleted file mode 100644 index 49e94388b..000000000 --- a/tests/integration/openai_responses/test_basic.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import pytest - -from ..test_cases.test_case import TestCase - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:non_streaming_01", - "openai:responses:non_streaming_02", - ], -) -def test_basic_non_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - question = tc["question"] - expected = tc["expected"] - - response = openai_client.responses.create( - model=text_model_id, - input=question, - stream=False, - ) - output_text = response.output_text.lower().strip() - assert len(output_text) > 0 - assert expected.lower() in output_text - - retrieved_response = openai_client.responses.retrieve(response_id=response.id) - assert retrieved_response.output_text == response.output_text - - next_response = openai_client.responses.create( - model=text_model_id, input="Repeat your previous response in all caps.", previous_response_id=response.id - ) - next_output_text = next_response.output_text.strip() - assert expected.upper() in next_output_text - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:streaming_01", - "openai:responses:streaming_02", - ], -) -def test_basic_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - question = tc["question"] - expected = tc["expected"] - - response = openai_client.responses.create( - model=text_model_id, - input=question, - stream=True, - timeout=120, # Increase timeout to 2 minutes for large conversation history - ) - streamed_content = [] - response_id = "" - for chunk in response: - response_id = chunk.response.id - streamed_content.append(chunk.response.output_text.strip()) - - assert len(streamed_content) > 0 - assert expected.lower() in "".join(streamed_content).lower() - - retrieved_response = openai_client.responses.retrieve(response_id=response_id) - assert retrieved_response.output_text == "".join(streamed_content) - - next_response = openai_client.responses.create( - model=text_model_id, - input="Repeat your previous response in all caps.", - previous_response_id=response_id, - stream=True, - ) - next_streamed_content = [] - for chunk in next_response: - next_streamed_content.append(chunk.response.output_text.strip()) - assert expected.upper() in "".join(next_streamed_content) diff --git a/tests/integration/openai_responses/test_web_search_builtin.py b/tests/integration/openai_responses/test_web_search_builtin.py deleted file 
mode 100644 index 5f8a1afd7..000000000 --- a/tests/integration/openai_responses/test_web_search_builtin.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import pytest - -from ..test_cases.test_case import TestCase - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:tools_web_search_01", - ], -) -def test_web_search_non_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - input = tc["input"] - expected = tc["expected"] - tools = tc["tools"] - - response = openai_client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - ) - assert len(response.output) > 1 - assert response.output[0].type == "web_search_call" - assert response.output[0].status == "completed" - assert response.output[1].type == "message" - assert response.output[1].status == "completed" - assert response.output[1].role == "assistant" - assert len(response.output[1].content) > 0 - assert expected.lower() in response.output_text.lower().strip() - - -def test_input_image_non_streaming(openai_client, vision_model_id): - supported_models = ["llama-4", "gpt-4o", "llama4"] - if not any(model in vision_model_id.lower() for model in supported_models): - pytest.skip(f"Skip for non-supported model: {vision_model_id}") - - response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input=[ - { - "role": "user", - "content": [ - { - "type": "input_text", - "text": "Identify the type of animal in this image.", - }, - { - "type": "input_image", - "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg", - }, - ], - } - ], - ) - output_text = response.output_text.lower() - assert "llama" in output_text - - -def test_multi_turn_web_search_from_image_non_streaming(openai_client, vision_model_id): - supported_models = ["llama-4", "gpt-4o", "llama4"] - if not any(model in vision_model_id.lower() for model in supported_models): - pytest.skip(f"Skip for non-supported model: {vision_model_id}") - - response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input=[ - { - "role": "user", - "content": [ - { - "type": "input_text", - "text": "Extract a single search keyword that represents the type of animal in this image.", - }, - { - "type": "input_image", - "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg", - }, - ], - } - ], - ) - output_text = response.output_text.lower() - assert "llama" in output_text - - search_response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input="Search the web using the search tool for those keywords plus the words 'maverick' and 'scout' and summarize the results.", - previous_response_id=response.id, - tools=[{"type": "web_search"}], - ) - output_text = search_response.output_text.lower() - assert "model" in output_text diff --git a/tests/verifications/conf/fireworks-llama-stack.yaml b/tests/verifications/conf/fireworks-llama-stack.yaml index fc78a1377..dffd7c739 100644 --- a/tests/verifications/conf/fireworks-llama-stack.yaml +++ b/tests/verifications/conf/fireworks-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - 
test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/conf/groq-llama-stack.yaml b/tests/verifications/conf/groq-llama-stack.yaml index 6958bafc5..786b79c24 100644 --- a/tests/verifications/conf/groq-llama-stack.yaml +++ b/tests/verifications/conf/groq-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/conf/together-llama-stack.yaml b/tests/verifications/conf/together-llama-stack.yaml index 719e2d776..58cbcfa93 100644 --- a/tests/verifications/conf/together-llama-stack.yaml +++ b/tests/verifications/conf/together-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/generate_report.py b/tests/verifications/generate_report.py index f0894bfce..bdaea3ebf 100755 --- a/tests/verifications/generate_report.py +++ b/tests/verifications/generate_report.py @@ -16,7 +16,7 @@ Description: Configuration: - - Provider details (models, display names) are loaded from `tests/verifications/config.yaml`. + - Provider details (models, display names) are loaded from `tests/verifications/conf/*.yaml`. - Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`. - Test results are stored in `tests/verifications/test_results/`. diff --git a/tests/verifications/openai_api/conftest.py b/tests/verifications/openai_api/conftest.py new file mode 100644 index 000000000..7b4c92f1c --- /dev/null +++ b/tests/verifications/openai_api/conftest.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs + + +def pytest_generate_tests(metafunc): + """Dynamically parametrize tests based on the selected provider and config.""" + if "model" in metafunc.fixturenames: + provider = metafunc.config.getoption("provider") + if not provider: + print("Warning: --provider not specified. Skipping model parametrization.") + metafunc.parametrize("model", []) + return + + try: + config_data = _load_all_verification_configs() + except (FileNotFoundError, IOError) as e: + print(f"ERROR loading verification configs: {e}") + config_data = {"providers": {}} + + provider_config = config_data.get("providers", {}).get(provider) + if provider_config: + models = provider_config.get("models", []) + if models: + metafunc.parametrize("model", models) + else: + print(f"Warning: No models found for provider '{provider}' in config.") + metafunc.parametrize("model", []) # Parametrize empty if no models found + else: + print(f"Warning: Provider '{provider}' not found in config. 
No models parametrized.") + metafunc.parametrize("model", []) # Parametrize empty if provider not found diff --git a/tests/verifications/openai_api/fixtures/fixtures.py b/tests/verifications/openai_api/fixtures/fixtures.py index 940b99b2a..2ea73cf26 100644 --- a/tests/verifications/openai_api/fixtures/fixtures.py +++ b/tests/verifications/openai_api/fixtures/fixtures.py @@ -5,14 +5,16 @@ # the root directory of this source tree. import os +import re from pathlib import Path import pytest import yaml from openai import OpenAI +# --- Helper Functions --- + -# --- Helper Function to Load Config --- def _load_all_verification_configs(): """Load and aggregate verification configs from the conf/ directory.""" # Note: Path is relative to *this* file (fixtures.py) @@ -44,7 +46,30 @@ def _load_all_verification_configs(): return {"providers": all_provider_configs} -# --- End Helper Function --- +def case_id_generator(case): + """Generate a test ID from the case's 'case_id' field, or use a default.""" + case_id = case.get("case_id") + if isinstance(case_id, (str, int)): + return re.sub(r"\\W|^(?=\\d)", "_", str(case_id)) + return None + + +def should_skip_test(verification_config, provider, model, test_name_base): + """Check if a test should be skipped based on config exclusions.""" + provider_config = verification_config.get("providers", {}).get(provider) + if not provider_config: + return False # No config for provider, don't skip + + exclusions = provider_config.get("test_exclusions", {}).get(model, []) + return test_name_base in exclusions + + +# Helper to get the base test name from the request object +def get_base_test_name(request): + return request.node.originalname + + +# --- End Helper Functions --- @pytest.fixture(scope="session") diff --git a/tests/verifications/openai_api/fixtures/test_cases/response.yaml b/tests/verifications/openai_api/fixtures/test_cases/response.yaml new file mode 100644 index 000000000..f235b2ea8 --- /dev/null +++ b/tests/verifications/openai_api/fixtures/test_cases/response.yaml @@ -0,0 +1,65 @@ +test_response_basic: + test_name: test_response_basic + test_params: + case: + - case_id: "earth" + input: "Which planet do humans live on?" + output: "earth" + - case_id: "saturn" + input: "Which planet has rings around it with a name starting with letter S?" + output: "saturn" + +test_response_multi_turn: + test_name: test_response_multi_turn + test_params: + case: + - case_id: "earth" + turns: + - input: "Which planet do humans live on?" + output: "earth" + - input: "What is the name of the planet from your previous response?" + output: "earth" + +test_response_web_search: + test_name: test_response_web_search + test_params: + case: + - case_id: "llama_experts" + input: "How many experts does the Llama 4 Maverick model have?" + tools: + - type: web_search + search_context_size: "low" + output: "128" + +test_response_image: + test_name: test_response_image + test_params: + case: + - case_id: "llama_image" + input: + - role: user + content: + - type: input_text + text: "Identify the type of animal in this image." + - type: input_image + image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg" + output: "llama" + +test_response_multi_turn_image: + test_name: test_response_multi_turn_image + test_params: + case: + - case_id: "llama_image_search" + turns: + - input: + - role: user + content: + - type: input_text + text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'." 
+ - type: input_image + image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg" + output: "llama" + - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick' and 'scout'." + tools: + - type: web_search + output: "model" diff --git a/tests/verifications/openai_api/test_chat_completion.py b/tests/verifications/openai_api/test_chat_completion.py index 277eaafa3..64e49d352 100644 --- a/tests/verifications/openai_api/test_chat_completion.py +++ b/tests/verifications/openai_api/test_chat_completion.py @@ -7,7 +7,6 @@ import base64 import copy import json -import re from pathlib import Path from typing import Any @@ -16,7 +15,9 @@ from openai import APIError from pydantic import BaseModel from tests.verifications.openai_api.fixtures.fixtures import ( - _load_all_verification_configs, + case_id_generator, + get_base_test_name, + should_skip_test, ) from tests.verifications.openai_api.fixtures.load import load_test_cases @@ -25,57 +26,6 @@ chat_completion_test_cases = load_test_cases("chat_completion") THIS_DIR = Path(__file__).parent -def case_id_generator(case): - """Generate a test ID from the case's 'case_id' field, or use a default.""" - case_id = case.get("case_id") - if isinstance(case_id, (str, int)): - return re.sub(r"\\W|^(?=\\d)", "_", str(case_id)) - return None - - -def pytest_generate_tests(metafunc): - """Dynamically parametrize tests based on the selected provider and config.""" - if "model" in metafunc.fixturenames: - provider = metafunc.config.getoption("provider") - if not provider: - print("Warning: --provider not specified. Skipping model parametrization.") - metafunc.parametrize("model", []) - return - - try: - config_data = _load_all_verification_configs() - except (FileNotFoundError, IOError) as e: - print(f"ERROR loading verification configs: {e}") - config_data = {"providers": {}} - - provider_config = config_data.get("providers", {}).get(provider) - if provider_config: - models = provider_config.get("models", []) - if models: - metafunc.parametrize("model", models) - else: - print(f"Warning: No models found for provider '{provider}' in config.") - metafunc.parametrize("model", []) # Parametrize empty if no models found - else: - print(f"Warning: Provider '{provider}' not found in config. No models parametrized.") - metafunc.parametrize("model", []) # Parametrize empty if provider not found - - -def should_skip_test(verification_config, provider, model, test_name_base): - """Check if a test should be skipped based on config exclusions.""" - provider_config = verification_config.get("providers", {}).get(provider) - if not provider_config: - return False # No config for provider, don't skip - - exclusions = provider_config.get("test_exclusions", {}).get(model, []) - return test_name_base in exclusions - - -# Helper to get the base test name from the request object -def get_base_test_name(request): - return request.node.originalname - - @pytest.fixture def multi_image_data(): files = [ diff --git a/tests/verifications/openai_api/test_response.py b/tests/verifications/openai_api/test_response.py new file mode 100644 index 000000000..d4bbb526a --- /dev/null +++ b/tests/verifications/openai_api/test_response.py @@ -0,0 +1,166 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +import pytest + +from tests.verifications.openai_api.fixtures.fixtures import ( + case_id_generator, + get_base_test_name, + should_skip_test, +) +from tests.verifications.openai_api.fixtures.load import load_test_cases + +response_test_cases = load_test_cases("response") + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_basic"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_basic(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=False, + ) + output_text = response.output_text.lower().strip() + assert len(output_text) > 0 + assert case["output"].lower() in output_text + + retrieved_response = openai_client.responses.retrieve(response_id=response.id) + assert retrieved_response.output_text == response.output_text + + next_response = openai_client.responses.create( + model=model, input="Repeat your previous response in all caps.", previous_response_id=response.id + ) + next_output_text = next_response.output_text.strip() + assert case["output"].upper() in next_output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_basic"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_streaming_basic(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=True, + ) + streamed_content = [] + response_id = "" + for chunk in response: + if chunk.type == "response.completed": + response_id = chunk.response.id + streamed_content.append(chunk.response.output_text.strip()) + + assert len(streamed_content) > 0 + assert case["output"].lower() in "".join(streamed_content).lower() + + retrieved_response = openai_client.responses.retrieve(response_id=response_id) + assert retrieved_response.output_text == "".join(streamed_content) + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_multi_turn"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_multi_turn(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + previous_response_id = None + for turn in case["turns"]: + response = openai_client.responses.create( + model=model, + input=turn["input"], + previous_response_id=previous_response_id, + tools=turn["tools"] if "tools" in turn else None, + ) + previous_response_id = response.id + output_text = response.output_text.lower() + assert turn["output"].lower() in output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_web_search"]["test_params"]["case"], + ids=case_id_generator, 
+) +def test_response_non_streaming_web_search(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + tools=case["tools"], + stream=False, + ) + assert len(response.output) > 1 + assert response.output[0].type == "web_search_call" + assert response.output[0].status == "completed" + assert response.output[1].type == "message" + assert response.output[1].status == "completed" + assert response.output[1].role == "assistant" + assert len(response.output[1].content) > 0 + assert case["output"].lower() in response.output_text.lower().strip() + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_image"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_image(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=False, + ) + output_text = response.output_text.lower() + assert case["output"].lower() in output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_multi_turn_image"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_multi_turn_image(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + previous_response_id = None + for turn in case["turns"]: + response = openai_client.responses.create( + model=model, + input=turn["input"], + previous_response_id=previous_response_id, + tools=turn["tools"] if "tools" in turn else None, + ) + previous_response_id = response.id + output_text = response.output_text.lower() + assert turn["output"].lower() in output_text
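
A minimal sketch of how the relocated tests might be invoked, assuming the verification suite's existing --provider pytest option (read via metafunc.config.getoption("provider") in the conftest above) and that provider names match the conf/*.yaml filenames in this patch; "fireworks-llama-stack" here is only an illustrative choice, not the only supported value:

# Hypothetical invocation of the new Responses API verification tests from the repo root.
# The provider name is an assumption based on the conf/ files touched in this patch.
import pytest

pytest.main([
    "tests/verifications/openai_api/test_response.py",
    "--provider=fireworks-llama-stack",
])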