From 207224a8113d89e8abb8db016c630d79c0bbd330 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Fri, 18 Apr 2025 15:26:34 -0400
Subject: [PATCH] OpenAI Responses - move tests under tests/verifications

This moves the OpenAI Responses API tests under
tests/verifications/openai_api/test_response.py and starts to wire them
up to our verification suite, so that we can test multiple providers as
well as OpenAI directly for the Responses API.

Signed-off-by: Ben Browning
---
 .../apis/openai_responses/openai_responses.py  |  19 +-
 .../openai_responses/openai_responses.py       |  59 ++++---
 .../integration/openai_responses/__init__.py   |   5 -
 .../openai_responses/test_basic.py             |  83 ---------
 .../test_web_search_builtin.py                 | 101 -----------
 .../conf/fireworks-llama-stack.yaml            |   2 +
 tests/verifications/conf/groq-llama-stack.yaml |   2 +
 .../conf/together-llama-stack.yaml             |   2 +
 tests/verifications/generate_report.py         |   2 +-
 tests/verifications/openai_api/conftest.py     |  35 ++++
 .../openai_api/fixtures/fixtures.py            |  29 ++-
 .../fixtures/test_cases/response.yaml          |  65 +++++++
 .../openai_api/test_chat_completion.py         |  56 +-----
 .../verifications/openai_api/test_response.py  | 166 ++++++++++++++++++
 14 files changed, 353 insertions(+), 273 deletions(-)
 delete mode 100644 tests/integration/openai_responses/__init__.py
 delete mode 100644 tests/integration/openai_responses/test_basic.py
 delete mode 100644 tests/integration/openai_responses/test_web_search_builtin.py
 create mode 100644 tests/verifications/openai_api/conftest.py
 create mode 100644 tests/verifications/openai_api/fixtures/test_cases/response.yaml
 create mode 100644 tests/verifications/openai_api/test_response.py

diff --git a/llama_stack/apis/openai_responses/openai_responses.py b/llama_stack/apis/openai_responses/openai_responses.py
index 87ccfdabd..0b21f3f28 100644
--- a/llama_stack/apis/openai_responses/openai_responses.py
+++ b/llama_stack/apis/openai_responses/openai_responses.py
@@ -75,11 +75,27 @@ class OpenAIResponseObject(BaseModel):
 
 
 @json_schema_type
-class OpenAIResponseObjectStream(BaseModel):
+class OpenAIResponseObjectStreamResponseCreated(BaseModel):
     response: OpenAIResponseObject
     type: Literal["response.created"] = "response.created"
 
 
+@json_schema_type
+class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.completed"] = "response.completed"
+
+
+OpenAIResponseObjectStream = Annotated[
+    Union[
+        OpenAIResponseObjectStreamResponseCreated,
+        OpenAIResponseObjectStreamResponseCompleted,
+    ],
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
+
+
 @json_schema_type
 class OpenAIResponseInputMessageContentText(BaseModel):
     text: str
@@ -112,6 +128,7 @@ class OpenAIResponseInputMessage(BaseModel):
 @json_schema_type
 class OpenAIResponseInputToolWebSearch(BaseModel):
     type: Literal["web_search", "web_search_preview_2025_03_11"] = "web_search"
+    # TODO: actually use search_context_size somewhere...
search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$") # TODO: add user_location diff --git a/llama_stack/providers/inline/openai_responses/openai_responses.py b/llama_stack/providers/inline/openai_responses/openai_responses.py index 5f5df6ad0..c7d767f73 100644 --- a/llama_stack/providers/inline/openai_responses/openai_responses.py +++ b/llama_stack/providers/inline/openai_responses/openai_responses.py @@ -33,6 +33,8 @@ from llama_stack.apis.openai_responses.openai_responses import ( OpenAIResponseInputTool, OpenAIResponseObject, OpenAIResponseObjectStream, + OpenAIResponseObjectStreamResponseCompleted, + OpenAIResponseObjectStreamResponseCreated, OpenAIResponseOutput, OpenAIResponseOutputMessage, OpenAIResponseOutputMessageContentOutputText, @@ -174,7 +176,8 @@ class OpenAIResponsesImpl(OpenAIResponses): for chunk_choice in chunk.choices: # TODO: this only works for text content chat_response_content.append(chunk_choice.delta.content or "") - chunk_finish_reason = chunk_choice.finish_reason + if chunk_choice.finish_reason: + chunk_finish_reason = chunk_choice.finish_reason assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content)) chat_response = OpenAIChatCompletion( id=chat_response_id, @@ -219,7 +222,9 @@ class OpenAIResponsesImpl(OpenAIResponses): if stream: async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]: - yield OpenAIResponseObjectStream(response=response) + # TODO: response created should actually get emitted much earlier in the process + yield OpenAIResponseObjectStreamResponseCreated(response=response) + yield OpenAIResponseObjectStreamResponseCompleted(response=response) return async_response() @@ -270,40 +275,40 @@ class OpenAIResponsesImpl(OpenAIResponses): # Add the assistant message with tool_calls response to the messages list messages.append(choice.message) - # TODO: handle multiple tool calls - tool_call = choice.message.tool_calls[0] - tool_call_id = tool_call.id - function = tool_call.function + for tool_call in choice.message.tool_calls: + tool_call_id = tool_call.id + function = tool_call.function - # If for some reason the tool call doesn't have a function or id, we can't execute it - if not function or not tool_call_id: - return output_messages + # If for some reason the tool call doesn't have a function or id, we can't execute it + if not function or not tool_call_id: + continue - # TODO: telemetry spans for tool calls - result = await self._execute_tool_call(function) + # TODO: telemetry spans for tool calls + result = await self._execute_tool_call(function) + + # Handle tool call failure + if not result: + output_messages.append( + OpenAIResponseOutputMessageWebSearchToolCall( + id=tool_call_id, + status="failed", + ) + ) + continue - # Handle tool call failure - if not result: output_messages.append( OpenAIResponseOutputMessageWebSearchToolCall( id=tool_call_id, - status="failed", - ) + status="completed", + ), ) - return output_messages - output_messages.append( - OpenAIResponseOutputMessageWebSearchToolCall( - id=tool_call_id, - status="completed", - ), - ) + result_content = "" + # TODO: handle other result content types and lists + if isinstance(result.content, str): + result_content = result.content + messages.append(OpenAIToolMessageParam(content=result_content, tool_call_id=tool_call_id)) - result_content = "" - # TODO: handle other result content types and lists - if isinstance(result.content, str): - result_content = result.content - 
messages.append(OpenAIToolMessageParam(content=result_content, tool_call_id=tool_call_id)) tool_results_chat_response = await self.inference_api.openai_chat_completion( model=model_id, messages=messages, diff --git a/tests/integration/openai_responses/__init__.py b/tests/integration/openai_responses/__init__.py deleted file mode 100644 index 756f351d8..000000000 --- a/tests/integration/openai_responses/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/tests/integration/openai_responses/test_basic.py b/tests/integration/openai_responses/test_basic.py deleted file mode 100644 index 49e94388b..000000000 --- a/tests/integration/openai_responses/test_basic.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import pytest - -from ..test_cases.test_case import TestCase - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:non_streaming_01", - "openai:responses:non_streaming_02", - ], -) -def test_basic_non_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - question = tc["question"] - expected = tc["expected"] - - response = openai_client.responses.create( - model=text_model_id, - input=question, - stream=False, - ) - output_text = response.output_text.lower().strip() - assert len(output_text) > 0 - assert expected.lower() in output_text - - retrieved_response = openai_client.responses.retrieve(response_id=response.id) - assert retrieved_response.output_text == response.output_text - - next_response = openai_client.responses.create( - model=text_model_id, input="Repeat your previous response in all caps.", previous_response_id=response.id - ) - next_output_text = next_response.output_text.strip() - assert expected.upper() in next_output_text - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:streaming_01", - "openai:responses:streaming_02", - ], -) -def test_basic_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - question = tc["question"] - expected = tc["expected"] - - response = openai_client.responses.create( - model=text_model_id, - input=question, - stream=True, - timeout=120, # Increase timeout to 2 minutes for large conversation history - ) - streamed_content = [] - response_id = "" - for chunk in response: - response_id = chunk.response.id - streamed_content.append(chunk.response.output_text.strip()) - - assert len(streamed_content) > 0 - assert expected.lower() in "".join(streamed_content).lower() - - retrieved_response = openai_client.responses.retrieve(response_id=response_id) - assert retrieved_response.output_text == "".join(streamed_content) - - next_response = openai_client.responses.create( - model=text_model_id, - input="Repeat your previous response in all caps.", - previous_response_id=response_id, - stream=True, - ) - next_streamed_content = [] - for chunk in next_response: - next_streamed_content.append(chunk.response.output_text.strip()) - assert expected.upper() in "".join(next_streamed_content) diff --git a/tests/integration/openai_responses/test_web_search_builtin.py b/tests/integration/openai_responses/test_web_search_builtin.py deleted file 
mode 100644 index 5f8a1afd7..000000000 --- a/tests/integration/openai_responses/test_web_search_builtin.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import pytest - -from ..test_cases.test_case import TestCase - - -@pytest.mark.parametrize( - "test_case", - [ - "openai:responses:tools_web_search_01", - ], -) -def test_web_search_non_streaming(openai_client, client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - input = tc["input"] - expected = tc["expected"] - tools = tc["tools"] - - response = openai_client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - ) - assert len(response.output) > 1 - assert response.output[0].type == "web_search_call" - assert response.output[0].status == "completed" - assert response.output[1].type == "message" - assert response.output[1].status == "completed" - assert response.output[1].role == "assistant" - assert len(response.output[1].content) > 0 - assert expected.lower() in response.output_text.lower().strip() - - -def test_input_image_non_streaming(openai_client, vision_model_id): - supported_models = ["llama-4", "gpt-4o", "llama4"] - if not any(model in vision_model_id.lower() for model in supported_models): - pytest.skip(f"Skip for non-supported model: {vision_model_id}") - - response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input=[ - { - "role": "user", - "content": [ - { - "type": "input_text", - "text": "Identify the type of animal in this image.", - }, - { - "type": "input_image", - "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg", - }, - ], - } - ], - ) - output_text = response.output_text.lower() - assert "llama" in output_text - - -def test_multi_turn_web_search_from_image_non_streaming(openai_client, vision_model_id): - supported_models = ["llama-4", "gpt-4o", "llama4"] - if not any(model in vision_model_id.lower() for model in supported_models): - pytest.skip(f"Skip for non-supported model: {vision_model_id}") - - response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input=[ - { - "role": "user", - "content": [ - { - "type": "input_text", - "text": "Extract a single search keyword that represents the type of animal in this image.", - }, - { - "type": "input_image", - "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg", - }, - ], - } - ], - ) - output_text = response.output_text.lower() - assert "llama" in output_text - - search_response = openai_client.with_options(max_retries=0).responses.create( - model=vision_model_id, - input="Search the web using the search tool for those keywords plus the words 'maverick' and 'scout' and summarize the results.", - previous_response_id=response.id, - tools=[{"type": "web_search"}], - ) - output_text = search_response.output_text.lower() - assert "model" in output_text diff --git a/tests/verifications/conf/fireworks-llama-stack.yaml b/tests/verifications/conf/fireworks-llama-stack.yaml index fc78a1377..dffd7c739 100644 --- a/tests/verifications/conf/fireworks-llama-stack.yaml +++ b/tests/verifications/conf/fireworks-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - 
test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/conf/groq-llama-stack.yaml b/tests/verifications/conf/groq-llama-stack.yaml index 6958bafc5..786b79c24 100644 --- a/tests/verifications/conf/groq-llama-stack.yaml +++ b/tests/verifications/conf/groq-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/conf/together-llama-stack.yaml b/tests/verifications/conf/together-llama-stack.yaml index 719e2d776..58cbcfa93 100644 --- a/tests/verifications/conf/together-llama-stack.yaml +++ b/tests/verifications/conf/together-llama-stack.yaml @@ -13,3 +13,5 @@ test_exclusions: - test_chat_non_streaming_image - test_chat_streaming_image - test_chat_multi_turn_multiple_images + - test_response_non_streaming_image + - test_response_non_streaming_multi_turn_image diff --git a/tests/verifications/generate_report.py b/tests/verifications/generate_report.py index f0894bfce..bdaea3ebf 100755 --- a/tests/verifications/generate_report.py +++ b/tests/verifications/generate_report.py @@ -16,7 +16,7 @@ Description: Configuration: - - Provider details (models, display names) are loaded from `tests/verifications/config.yaml`. + - Provider details (models, display names) are loaded from `tests/verifications/conf/*.yaml`. - Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`. - Test results are stored in `tests/verifications/test_results/`. diff --git a/tests/verifications/openai_api/conftest.py b/tests/verifications/openai_api/conftest.py new file mode 100644 index 000000000..7b4c92f1c --- /dev/null +++ b/tests/verifications/openai_api/conftest.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs + + +def pytest_generate_tests(metafunc): + """Dynamically parametrize tests based on the selected provider and config.""" + if "model" in metafunc.fixturenames: + provider = metafunc.config.getoption("provider") + if not provider: + print("Warning: --provider not specified. Skipping model parametrization.") + metafunc.parametrize("model", []) + return + + try: + config_data = _load_all_verification_configs() + except (FileNotFoundError, IOError) as e: + print(f"ERROR loading verification configs: {e}") + config_data = {"providers": {}} + + provider_config = config_data.get("providers", {}).get(provider) + if provider_config: + models = provider_config.get("models", []) + if models: + metafunc.parametrize("model", models) + else: + print(f"Warning: No models found for provider '{provider}' in config.") + metafunc.parametrize("model", []) # Parametrize empty if no models found + else: + print(f"Warning: Provider '{provider}' not found in config. 
No models parametrized.") + metafunc.parametrize("model", []) # Parametrize empty if provider not found diff --git a/tests/verifications/openai_api/fixtures/fixtures.py b/tests/verifications/openai_api/fixtures/fixtures.py index 940b99b2a..2ea73cf26 100644 --- a/tests/verifications/openai_api/fixtures/fixtures.py +++ b/tests/verifications/openai_api/fixtures/fixtures.py @@ -5,14 +5,16 @@ # the root directory of this source tree. import os +import re from pathlib import Path import pytest import yaml from openai import OpenAI +# --- Helper Functions --- + -# --- Helper Function to Load Config --- def _load_all_verification_configs(): """Load and aggregate verification configs from the conf/ directory.""" # Note: Path is relative to *this* file (fixtures.py) @@ -44,7 +46,30 @@ def _load_all_verification_configs(): return {"providers": all_provider_configs} -# --- End Helper Function --- +def case_id_generator(case): + """Generate a test ID from the case's 'case_id' field, or use a default.""" + case_id = case.get("case_id") + if isinstance(case_id, (str, int)): + return re.sub(r"\\W|^(?=\\d)", "_", str(case_id)) + return None + + +def should_skip_test(verification_config, provider, model, test_name_base): + """Check if a test should be skipped based on config exclusions.""" + provider_config = verification_config.get("providers", {}).get(provider) + if not provider_config: + return False # No config for provider, don't skip + + exclusions = provider_config.get("test_exclusions", {}).get(model, []) + return test_name_base in exclusions + + +# Helper to get the base test name from the request object +def get_base_test_name(request): + return request.node.originalname + + +# --- End Helper Functions --- @pytest.fixture(scope="session") diff --git a/tests/verifications/openai_api/fixtures/test_cases/response.yaml b/tests/verifications/openai_api/fixtures/test_cases/response.yaml new file mode 100644 index 000000000..f235b2ea8 --- /dev/null +++ b/tests/verifications/openai_api/fixtures/test_cases/response.yaml @@ -0,0 +1,65 @@ +test_response_basic: + test_name: test_response_basic + test_params: + case: + - case_id: "earth" + input: "Which planet do humans live on?" + output: "earth" + - case_id: "saturn" + input: "Which planet has rings around it with a name starting with letter S?" + output: "saturn" + +test_response_multi_turn: + test_name: test_response_multi_turn + test_params: + case: + - case_id: "earth" + turns: + - input: "Which planet do humans live on?" + output: "earth" + - input: "What is the name of the planet from your previous response?" + output: "earth" + +test_response_web_search: + test_name: test_response_web_search + test_params: + case: + - case_id: "llama_experts" + input: "How many experts does the Llama 4 Maverick model have?" + tools: + - type: web_search + search_context_size: "low" + output: "128" + +test_response_image: + test_name: test_response_image + test_params: + case: + - case_id: "llama_image" + input: + - role: user + content: + - type: input_text + text: "Identify the type of animal in this image." + - type: input_image + image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg" + output: "llama" + +test_response_multi_turn_image: + test_name: test_response_multi_turn_image + test_params: + case: + - case_id: "llama_image_search" + turns: + - input: + - role: user + content: + - type: input_text + text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'." 
+ - type: input_image + image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg" + output: "llama" + - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick' and 'scout'." + tools: + - type: web_search + output: "model" diff --git a/tests/verifications/openai_api/test_chat_completion.py b/tests/verifications/openai_api/test_chat_completion.py index 277eaafa3..64e49d352 100644 --- a/tests/verifications/openai_api/test_chat_completion.py +++ b/tests/verifications/openai_api/test_chat_completion.py @@ -7,7 +7,6 @@ import base64 import copy import json -import re from pathlib import Path from typing import Any @@ -16,7 +15,9 @@ from openai import APIError from pydantic import BaseModel from tests.verifications.openai_api.fixtures.fixtures import ( - _load_all_verification_configs, + case_id_generator, + get_base_test_name, + should_skip_test, ) from tests.verifications.openai_api.fixtures.load import load_test_cases @@ -25,57 +26,6 @@ chat_completion_test_cases = load_test_cases("chat_completion") THIS_DIR = Path(__file__).parent -def case_id_generator(case): - """Generate a test ID from the case's 'case_id' field, or use a default.""" - case_id = case.get("case_id") - if isinstance(case_id, (str, int)): - return re.sub(r"\\W|^(?=\\d)", "_", str(case_id)) - return None - - -def pytest_generate_tests(metafunc): - """Dynamically parametrize tests based on the selected provider and config.""" - if "model" in metafunc.fixturenames: - provider = metafunc.config.getoption("provider") - if not provider: - print("Warning: --provider not specified. Skipping model parametrization.") - metafunc.parametrize("model", []) - return - - try: - config_data = _load_all_verification_configs() - except (FileNotFoundError, IOError) as e: - print(f"ERROR loading verification configs: {e}") - config_data = {"providers": {}} - - provider_config = config_data.get("providers", {}).get(provider) - if provider_config: - models = provider_config.get("models", []) - if models: - metafunc.parametrize("model", models) - else: - print(f"Warning: No models found for provider '{provider}' in config.") - metafunc.parametrize("model", []) # Parametrize empty if no models found - else: - print(f"Warning: Provider '{provider}' not found in config. No models parametrized.") - metafunc.parametrize("model", []) # Parametrize empty if provider not found - - -def should_skip_test(verification_config, provider, model, test_name_base): - """Check if a test should be skipped based on config exclusions.""" - provider_config = verification_config.get("providers", {}).get(provider) - if not provider_config: - return False # No config for provider, don't skip - - exclusions = provider_config.get("test_exclusions", {}).get(model, []) - return test_name_base in exclusions - - -# Helper to get the base test name from the request object -def get_base_test_name(request): - return request.node.originalname - - @pytest.fixture def multi_image_data(): files = [ diff --git a/tests/verifications/openai_api/test_response.py b/tests/verifications/openai_api/test_response.py new file mode 100644 index 000000000..d4bbb526a --- /dev/null +++ b/tests/verifications/openai_api/test_response.py @@ -0,0 +1,166 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +import pytest + +from tests.verifications.openai_api.fixtures.fixtures import ( + case_id_generator, + get_base_test_name, + should_skip_test, +) +from tests.verifications.openai_api.fixtures.load import load_test_cases + +response_test_cases = load_test_cases("response") + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_basic"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_basic(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=False, + ) + output_text = response.output_text.lower().strip() + assert len(output_text) > 0 + assert case["output"].lower() in output_text + + retrieved_response = openai_client.responses.retrieve(response_id=response.id) + assert retrieved_response.output_text == response.output_text + + next_response = openai_client.responses.create( + model=model, input="Repeat your previous response in all caps.", previous_response_id=response.id + ) + next_output_text = next_response.output_text.strip() + assert case["output"].upper() in next_output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_basic"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_streaming_basic(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=True, + ) + streamed_content = [] + response_id = "" + for chunk in response: + if chunk.type == "response.completed": + response_id = chunk.response.id + streamed_content.append(chunk.response.output_text.strip()) + + assert len(streamed_content) > 0 + assert case["output"].lower() in "".join(streamed_content).lower() + + retrieved_response = openai_client.responses.retrieve(response_id=response_id) + assert retrieved_response.output_text == "".join(streamed_content) + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_multi_turn"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_multi_turn(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + previous_response_id = None + for turn in case["turns"]: + response = openai_client.responses.create( + model=model, + input=turn["input"], + previous_response_id=previous_response_id, + tools=turn["tools"] if "tools" in turn else None, + ) + previous_response_id = response.id + output_text = response.output_text.lower() + assert turn["output"].lower() in output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_web_search"]["test_params"]["case"], + ids=case_id_generator, 
+) +def test_response_non_streaming_web_search(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + tools=case["tools"], + stream=False, + ) + assert len(response.output) > 1 + assert response.output[0].type == "web_search_call" + assert response.output[0].status == "completed" + assert response.output[1].type == "message" + assert response.output[1].status == "completed" + assert response.output[1].role == "assistant" + assert len(response.output[1].content) > 0 + assert case["output"].lower() in response.output_text.lower().strip() + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_image"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_image(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.responses.create( + model=model, + input=case["input"], + stream=False, + ) + output_text = response.output_text.lower() + assert case["output"].lower() in output_text + + +@pytest.mark.parametrize( + "case", + response_test_cases["test_response_multi_turn_image"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_multi_turn_image(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + previous_response_id = None + for turn in case["turns"]: + response = openai_client.responses.create( + model=model, + input=turn["input"], + previous_response_id=previous_response_id, + tools=turn["tools"] if "tools" in turn else None, + ) + previous_response_id = response.id + output_text = response.output_text.lower() + assert turn["output"].lower() in output_text
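
A minimal sketch of how the relocated tests might be invoked, assuming the verification suite's existing --provider pytest option (read via metafunc.config.getoption("provider") in the conftest above) and that provider names match the conf/*.yaml filenames in this patch; "fireworks-llama-stack" here is only an illustrative choice, not the only supported value:

# Hypothetical invocation of the new Responses API verification tests from the repo root.
# The provider name is an assumption based on the conf/ files touched in this patch.
import pytest

pytest.main([
    "tests/verifications/openai_api/test_response.py",
    "--provider=fireworks-llama-stack",
])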