chore(tests): refactor and move responses tests away from verifications (#3068)

This PR kills the verifications infrastructure, which is no longer used: it was previously relocated to the `llama-stack-evals`
(https://github.com/meta-llama/llama-stack-evals) repository.

The Responses tests used this infrastructure, but that was never strictly
necessary; it was only mildly useful back when @bbrownin introduced the
tests. On Discord, we agreed that these tests can move to our regular
integration test infrastructure.
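
For context, a condensed sketch of the migration, based on the diff below (fixture and helper names are taken directly from the diff; the test body is abbreviated):

```python
# Condensed sketch of the migration, based on the diff in this PR: the old tests
# took (openai_client, model, provider, verification_config) and skipped
# themselves via should_skip_test() and per-provider YAML exclusions; the
# refactored tests take the regular integration fixtures (compat_client,
# text_model_id) and drop the exclusion logic entirely.
import pytest

from .fixtures.load import load_test_cases  # package-relative import, as in the diff

responses_test_cases = load_test_cases("responses")


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_basic"]["test_params"]["case"],
)
def test_response_non_streaming_basic(request, compat_client, text_model_id, case):
    response = compat_client.responses.create(
        model=text_model_id,
        input=case["input"],
        stream=False,
    )
    assert case["output"].lower() in response.output_text.lower().strip()
```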

## Test Plan

Some tests currently fail (although they do run!). I will send a
follow-up PR that makes them all pass.
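
For reviewers unfamiliar with the integration fixtures: the tests now rely on `compat_client` and `text_model_id` from the shared integration conftest, which is not part of this diff. Below is a purely hypothetical sketch of how such fixtures could be wired up; the environment variable names and the `starter` template are assumptions, not the actual conftest.

```python
# Hypothetical sketch only -- the real fixtures live in tests/integration/ and
# are not shown in this PR. Env var names and the "starter" template are assumptions.
import os

import pytest
from openai import OpenAI

from llama_stack import LlamaStackAsLibraryClient


@pytest.fixture(scope="session")
def compat_client():
    """An OpenAI-compatible client: either an HTTP client pointed at a running
    stack, or the in-process library client (which some tests skip on)."""
    base_url = os.environ.get("LLAMA_STACK_BASE_URL")  # assumed variable name
    if base_url:
        return OpenAI(base_url=f"{base_url}/v1/openai/v1", api_key="none")
    client = LlamaStackAsLibraryClient("starter")  # template name is an assumption
    client.initialize()
    return client


@pytest.fixture(scope="session")
def text_model_id():
    # Illustrative default; in practice this comes from the test run configuration.
    return os.environ.get("TEXT_MODEL_ID", "meta-llama/Llama-3.3-70B-Instruct")
```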
Ashwin Bharambe 2025-08-07 13:48:16 -07:00 committed by GitHub
parent 342550c1e2
commit 5f1ddd35e4
36 changed files with 93 additions and 13032 deletions


@ -53,7 +53,7 @@ jobs:
# Get test directories dynamically, excluding non-test directories
# NOTE: we are excluding post_training since the tests take too long
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT


@ -1,60 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
POST_TRAINING_PROVIDER_TYPES = ["remote::nvidia"]
@pytest.mark.integration
@pytest.fixture(scope="session")
def post_training_provider_available(llama_stack_client):
providers = llama_stack_client.providers.list()
post_training_providers = [p for p in providers if p.provider_type in POST_TRAINING_PROVIDER_TYPES]
return len(post_training_providers) > 0
@pytest.mark.integration
def test_post_training_provider_registration(llama_stack_client, post_training_provider_available):
"""Check if post_training is in the api list.
This is a sanity check to ensure the provider is registered."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
providers = llama_stack_client.providers.list()
post_training_providers = [p for p in providers if p.provider_type in POST_TRAINING_PROVIDER_TYPES]
assert len(post_training_providers) > 0
@pytest.mark.integration
def test_get_training_jobs(llama_stack_client, post_training_provider_available):
"""Test listing all training jobs."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
jobs = llama_stack_client.post_training.get_training_jobs()
assert isinstance(jobs, dict)
assert "data" in jobs
assert isinstance(jobs["data"], list)
@pytest.mark.integration
def test_get_training_job_status(llama_stack_client, post_training_provider_available):
"""Test getting status of a specific training job."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
jobs = llama_stack_client.post_training.get_training_jobs()
if not jobs["data"]:
pytest.skip("No training jobs available to check status")
job_uuid = jobs["data"][0]["job_uuid"]
job_status = llama_stack_client.post_training.get_training_job_status(job_uuid=job_uuid)
assert job_status is not None
assert "job_uuid" in job_status
assert "status" in job_status
assert job_status["job_uuid"] == job_uuid


@ -56,16 +56,6 @@ def case_id_generator(case):
return None
def should_skip_test(verification_config, provider, model, test_name_base):
"""Check if a test should be skipped based on config exclusions."""
provider_config = verification_config.get("providers", {}).get(provider)
if not provider_config:
return False # No config for provider, don't skip
exclusions = provider_config.get("test_exclusions", {}).get(model, [])
return test_name_base in exclusions
# Helper to get the base test name from the request object
def get_base_test_name(request):
return request.node.originalname


[binary image file changed: 108 KiB before, 108 KiB after]


[binary image file changed: 148 KiB before, 148 KiB after]


[binary image file changed: 139 KiB before, 139 KiB after]


@ -15,12 +15,9 @@ import pytest
from llama_stack import LlamaStackAsLibraryClient
from llama_stack.core.datatypes import AuthenticationRequiredError
from tests.common.mcp import dependency_tools, make_mcp_server
from tests.verifications.openai_api.fixtures.fixtures import (
case_id_generator,
get_base_test_name,
should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases
from .fixtures.fixtures import case_id_generator
from .fixtures.load import load_test_cases
responses_test_cases = load_test_cases("responses")
@ -55,13 +52,9 @@ def _upload_file(openai_client, name, file_path):
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_basic(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=False,
)
@ -69,11 +62,13 @@ def test_response_non_streaming_basic(request, openai_client, model, provider, v
assert len(output_text) > 0
assert case["output"].lower() in output_text
retrieved_response = openai_client.responses.retrieve(response_id=response.id)
retrieved_response = compat_client.responses.retrieve(response_id=response.id)
assert retrieved_response.output_text == response.output_text
next_response = openai_client.responses.create(
model=model, input="Repeat your previous response in all caps.", previous_response_id=response.id
next_response = compat_client.responses.create(
model=text_model_id,
input="Repeat your previous response in all caps.",
previous_response_id=response.id,
)
next_output_text = next_response.output_text.strip()
assert case["output"].upper() in next_output_text
@ -84,15 +79,11 @@ def test_response_non_streaming_basic(request, openai_client, model, provider, v
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_streaming_basic(request, compat_client, text_model_id, case):
import time
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=True,
)
@ -138,7 +129,7 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
assert created_index < completed_index, "response.created should come before response.completed"
# Verify stored response matches streamed response
retrieved_response = openai_client.responses.retrieve(response_id=response_id)
retrieved_response = compat_client.responses.retrieve(response_id=response_id)
final_event = events[-1]
assert retrieved_response.output_text == final_event.response.output_text
@ -148,16 +139,12 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_streaming_incremental_content(request, openai_client, model, provider, verification_config, case):
def test_response_streaming_incremental_content(request, compat_client, text_model_id, case):
"""Test that streaming actually delivers content incrementally, not just at the end."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
import time
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=True,
)
@ -241,15 +228,11 @@ def test_response_streaming_incremental_content(request, openai_client, model, p
responses_test_cases["test_response_multi_turn"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_multi_turn(request, compat_client, text_model_id, case):
previous_response_id = None
for turn in case["turns"]:
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=turn["input"],
previous_response_id=previous_response_id,
tools=turn["tools"] if "tools" in turn else None,
@ -264,13 +247,9 @@ def test_response_non_streaming_multi_turn(request, openai_client, model, provid
responses_test_cases["test_response_web_search"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_web_search(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_web_search(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=case["tools"],
stream=False,
@ -290,17 +269,11 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
responses_test_cases["test_response_file_search"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_file_search(
request, openai_client, model, provider, verification_config, tmp_path, case
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_non_streaming_file_search(request, compat_client, text_model_id, tmp_path, case):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
vector_store = _new_vector_store(compat_client, "test_vector_store")
if "file_content" in case:
file_name = "test_response_non_streaming_file_search.txt"
@ -312,10 +285,10 @@ def test_response_non_streaming_file_search(
else:
raise ValueError(f"No file content or path provided for case {case['case_id']}")
file_response = _upload_file(openai_client, file_name, file_path)
file_response = _upload_file(compat_client, file_name, file_path)
# Attach our file to the vector store
file_attach_response = openai_client.vector_stores.files.create(
file_attach_response = compat_client.vector_stores.files.create(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -323,7 +296,7 @@ def test_response_non_streaming_file_search(
# Wait for the file to be attached
while file_attach_response.status == "in_progress":
time.sleep(0.1)
file_attach_response = openai_client.vector_stores.files.retrieve(
file_attach_response = compat_client.vector_stores.files.retrieve(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -337,8 +310,8 @@ def test_response_non_streaming_file_search(
tool["vector_store_ids"] = [vector_store.id]
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -358,21 +331,15 @@ def test_response_non_streaming_file_search(
assert case["output"].lower() in response.output_text.lower().strip()
def test_response_non_streaming_file_search_empty_vector_store(
request, openai_client, model, provider, verification_config
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_non_streaming_file_search_empty_vector_store(request, compat_client, text_model_id):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
vector_store = _new_vector_store(compat_client, "test_vector_store")
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
stream=False,
@ -395,19 +362,15 @@ def test_response_non_streaming_file_search_empty_vector_store(
responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_mcp_tool(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_mcp_tool(request, compat_client, text_model_id, case):
with make_mcp_server() as mcp_server_info:
tools = case["tools"]
for tool in tools:
if tool["type"] == "mcp":
tool["server_url"] = mcp_server_info["server_url"]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -418,7 +381,7 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
assert list_tools.type == "mcp_list_tools"
assert list_tools.server_label == "localmcp"
assert len(list_tools.tools) == 2
assert {t["name"] for t in list_tools.tools} == {"get_boiling_point", "greet_everyone"}
assert {t.name for t in list_tools.tools} == {"get_boiling_point", "greet_everyone"}
call = response.output[1]
assert call.type == "mcp_call"
@ -440,12 +403,12 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
exc_type = (
AuthenticationRequiredError
if isinstance(openai_client, LlamaStackAsLibraryClient)
if isinstance(compat_client, LlamaStackAsLibraryClient)
else (httpx.HTTPStatusError, openai.AuthenticationError)
)
with pytest.raises(exc_type):
openai_client.responses.create(
model=model,
compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -456,8 +419,8 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
tool["server_url"] = mcp_server_info["server_url"]
tool["headers"] = {"Authorization": "Bearer test-token"}
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -470,13 +433,9 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
responses_test_cases["test_response_custom_tool"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_custom_tool(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_custom_tool(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=case["tools"],
stream=False,
@ -492,13 +451,9 @@ def test_response_non_streaming_custom_tool(request, openai_client, model, provi
responses_test_cases["test_response_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_image(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=False,
)
@ -511,15 +466,11 @@ def test_response_non_streaming_image(request, openai_client, model, provider, v
responses_test_cases["test_response_multi_turn_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_multi_turn_image(request, compat_client, text_model_id, case):
previous_response_id = None
for turn in case["turns"]:
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=turn["input"],
previous_response_id=previous_response_id,
tools=turn["tools"] if "tools" in turn else None,
@ -534,14 +485,8 @@ def test_response_non_streaming_multi_turn_image(request, openai_client, model,
responses_test_cases["test_response_multi_turn_tool_execution"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn_tool_execution(
request, openai_client, model, provider, verification_config, case
):
def test_response_non_streaming_multi_turn_tool_execution(request, compat_client, text_model_id, case):
"""Test multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
tools = case["tools"]
# Replace the placeholder URL with the actual server URL
@ -549,14 +494,15 @@ def test_response_non_streaming_multi_turn_tool_execution(
if tool["type"] == "mcp" and tool["server_url"] == "<FILLED_BY_TEST_RUNNER>":
tool["server_url"] = mcp_server_info["server_url"]
response = openai_client.responses.create(
response = compat_client.responses.create(
input=case["input"],
model=model,
model=text_model_id,
tools=tools,
)
# Verify we have MCP tool calls in the output
mcp_list_tools = [output for output in response.output if output.type == "mcp_list_tools"]
mcp_calls = [output for output in response.output if output.type == "mcp_call"]
message_outputs = [output for output in response.output if output.type == "message"]
@ -571,7 +517,7 @@ def test_response_non_streaming_multi_turn_tool_execution(
"get_experiment_id",
"get_experiment_results",
}
assert {t["name"] for t in mcp_list_tools[0].tools} == expected_tool_names
assert {t.name for t in mcp_list_tools[0].tools} == expected_tool_names
assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}"
for mcp_call in mcp_calls:
@ -595,14 +541,8 @@ def test_response_non_streaming_multi_turn_tool_execution(
responses_test_cases["test_response_multi_turn_tool_execution_streaming"]["test_params"]["case"],
ids=case_id_generator,
)
async def test_response_streaming_multi_turn_tool_execution(
request, openai_client, model, provider, verification_config, case
):
async def test_response_streaming_multi_turn_tool_execution(request, compat_client, text_model_id, case):
"""Test streaming multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
tools = case["tools"]
# Replace the placeholder URL with the actual server URL
@ -610,15 +550,15 @@ async def test_response_streaming_multi_turn_tool_execution(
if tool["type"] == "mcp" and tool["server_url"] == "<FILLED_BY_TEST_RUNNER>":
tool["server_url"] = mcp_server_info["server_url"]
stream = openai_client.responses.create(
stream = compat_client.responses.create(
input=case["input"],
model=model,
model=text_model_id,
tools=tools,
stream=True,
)
chunks = []
async for chunk in stream:
for chunk in stream:
chunks.append(chunk)
# Should have at least response.created and response.completed
@ -653,7 +593,7 @@ async def test_response_streaming_multi_turn_tool_execution(
"get_experiment_id",
"get_experiment_results",
}
assert {t["name"] for t in mcp_list_tools[0].tools} == expected_tool_names
assert {t.name for t in mcp_list_tools[0].tools} == expected_tool_names
# Should have at least 1 MCP call (the model should call at least one tool)
assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}"
@ -694,17 +634,13 @@ async def test_response_streaming_multi_turn_tool_execution(
},
],
)
def test_response_text_format(request, openai_client, model, provider, verification_config, text_format):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_text_format(request, compat_client, text_model_id, text_format):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API text format is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = False
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What is the capital of France?",
stream=stream,
text={"format": text_format},
@ -717,16 +653,12 @@ def test_response_text_format(request, openai_client, model, provider, verificat
@pytest.fixture
def vector_store_with_filtered_files(request, openai_client, model, provider, verification_config, tmp_path_factory):
def vector_store_with_filtered_files(request, compat_client, text_model_id, tmp_path_factory):
"""Create a vector store with multiple files that have different attributes for filtering tests."""
if isinstance(openai_client, LlamaStackAsLibraryClient):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store_with_filters")
vector_store = _new_vector_store(compat_client, "test_vector_store_with_filters")
tmp_path = tmp_path_factory.mktemp("filter_test_files")
# Create multiple files with different attributes
@ -776,18 +708,18 @@ def vector_store_with_filtered_files(request, openai_client, model, provider, ve
file_path.write_text(file_data["content"])
# Upload file
file_response = _upload_file(openai_client, file_data["name"], str(file_path))
file_response = _upload_file(compat_client, file_data["name"], str(file_path))
file_ids.append(file_response.id)
# Attach file to vector store with attributes
file_attach_response = openai_client.vector_stores.files.create(
file_attach_response = compat_client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_response.id, attributes=file_data["attributes"]
)
# Wait for attachment
while file_attach_response.status == "in_progress":
time.sleep(0.1)
file_attach_response = openai_client.vector_stores.files.retrieve(
file_attach_response = compat_client.vector_stores.files.retrieve(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -797,17 +729,17 @@ def vector_store_with_filtered_files(request, openai_client, model, provider, ve
# Cleanup: delete vector store and files
try:
openai_client.vector_stores.delete(vector_store_id=vector_store.id)
compat_client.vector_stores.delete(vector_store_id=vector_store.id)
for file_id in file_ids:
try:
openai_client.files.delete(file_id=file_id)
compat_client.files.delete(file_id=file_id)
except Exception:
pass # File might already be deleted
except Exception:
pass # Best effort cleanup
def test_response_file_search_filter_by_region(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_region(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with region equality filter."""
tools = [
{
@ -817,8 +749,8 @@ def test_response_file_search_filter_by_region(openai_client, model, vector_stor
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What are the updates from the US region?",
tools=tools,
stream=False,
@ -838,7 +770,7 @@ def test_response_file_search_filter_by_region(openai_client, model, vector_stor
assert "asia" not in result.text.lower()
def test_response_file_search_filter_by_category(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_category(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with category equality filter."""
tools = [
{
@ -848,8 +780,8 @@ def test_response_file_search_filter_by_category(openai_client, model, vector_st
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="Show me all marketing reports",
tools=tools,
stream=False,
@ -868,7 +800,7 @@ def test_response_file_search_filter_by_category(openai_client, model, vector_st
assert "revenue figures" not in result.text.lower()
def test_response_file_search_filter_by_date_range(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_date_range(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with date range filter using compound AND."""
tools = [
{
@ -892,8 +824,8 @@ def test_response_file_search_filter_by_date_range(openai_client, model, vector_
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What happened in Q1 2023?",
tools=tools,
stream=False,
@ -911,7 +843,7 @@ def test_response_file_search_filter_by_date_range(openai_client, model, vector_
assert "q3" not in result.text.lower()
def test_response_file_search_filter_compound_and(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_compound_and(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with compound AND filter (region AND category)."""
tools = [
{
@ -927,8 +859,8 @@ def test_response_file_search_filter_compound_and(openai_client, model, vector_s
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What are the engineering updates from the US?",
tools=tools,
stream=False,
@ -947,7 +879,7 @@ def test_response_file_search_filter_compound_and(openai_client, model, vector_s
assert "promotional" not in result.text.lower() and "revenue" not in result.text.lower()
def test_response_file_search_filter_compound_or(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_compound_or(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with compound OR filter (marketing OR sales)."""
tools = [
{
@ -963,8 +895,8 @@ def test_response_file_search_filter_compound_or(openai_client, model, vector_st
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="Show me marketing and sales documents",
tools=tools,
stream=False,


@ -1,79 +0,0 @@
# Llama Stack Verifications
Llama Stack Verifications provide standardized test suites to ensure API compatibility and behavior consistency across different LLM providers. These tests help verify that different models and providers implement the expected interfaces and behaviors correctly.
## Overview
This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.
## Features
The verification suite currently tests the following in both streaming and non-streaming modes:
- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality
## Report
The latest report can be found at [REPORT.md](REPORT.md).
To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run python tests/verifications/generate_report.py --run-tests
```
## Running Tests
To run the verification tests, use pytest with the following parameters:
```bash
cd llama-stack
pytest tests/verifications/openai_api --provider=<provider-name>
```
Example:
```bash
# Run all tests
pytest tests/verifications/openai_api --provider=together
# Only run tests with Llama 4 models
pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```
### Parameters
- `--provider`: The provider name (openai, fireworks, together, groq, cerebras, etc.)
- `--base-url`: The base URL for the provider's API (optional - defaults to the standard URL for the specified provider)
- `--api-key`: Your API key for the provider (optional - defaults to the standard API_KEY name for the specified provider)
## Supported Providers
The verification suite supports any provider with an OpenAI compatible endpoint.
See `tests/verifications/conf/` for the list of supported providers.
To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.
## Adding New Test Cases
To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.
## Structure
- `__init__.py` - Marks the directory as a Python package
- `conf/` - Provider-specific configuration files
- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases
- `test_cases/` - JSON test case definitions
- `test_chat_completion.py` - Tests for chat completion APIs


@ -1,232 +0,0 @@
# Test Results Report
*Generated on: 2025-04-17 12:42:33*
*This report was generated by running `python tests/verifications/generate_report.py`*
## Legend
- ✅ - Test passed
- ❌ - Test failed
- ⚪ - Test not applicable or not run for this model
## Summary
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Meta_reference | 100.0% | 28 | 28 |
| Together | 50.0% | 40 | 80 |
| Fireworks | 50.0% | 40 | 80 |
| Openai | 100.0% | 56 | 56 |
## Meta_reference
*Tests run on: 2025-04-17 12:37:11*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=meta_reference -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=meta_reference -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Meta_reference)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-4-Scout-Instruct |
| --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ |
| test_chat_non_streaming_image | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ |
| test_chat_non_streaming_tool_calling | ✅ |
| test_chat_non_streaming_tool_choice_none | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ |
| test_chat_streaming_basic (earth) | ✅ |
| test_chat_streaming_basic (saturn) | ✅ |
| test_chat_streaming_image | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ |
| test_chat_streaming_structured_output (math) | ✅ |
| test_chat_streaming_tool_calling | ✅ |
| test_chat_streaming_tool_choice_none | ✅ |
| test_chat_streaming_tool_choice_required | ✅ |
## Together
*Tests run on: 2025-04-17 12:27:45*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Together)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ⚪ | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ⚪ | ❌ | ❌ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_none | ❌ | ❌ | ❌ |
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_calling | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_choice_none | ❌ | ❌ | ❌ |
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
## Fireworks
*Tests run on: 2025-04-17 12:29:53*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Fireworks)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ⚪ | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_streaming_tool_choice_none | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
## Openai
*Tests run on: 2025-04-17 12:34:08*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Openai)**
| Display Name | Full Model ID |
| --- | --- |
| gpt-4o | `gpt-4o` |
| gpt-4o-mini | `gpt-4o-mini` |
| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_non_streaming_image | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_streaming_image | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_streaming_tool_calling | ✅ | ✅ |
| test_chat_streaming_tool_choice_none | ✅ | ✅ |
| test_chat_streaming_tool_choice_required | ✅ | ✅ |


@ -1,11 +0,0 @@
base_url: https://api.cerebras.ai/v1
api_key_var: CEREBRAS_API_KEY
models:
- llama-3.3-70b
model_display_names:
llama-3.3-70b: Llama-3.3-70B-Instruct
test_exclusions:
llama-3.3-70b:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: FIREWORKS_API_KEY
models:
- fireworks/llama-v3p3-70b-instruct
- fireworks/llama4-scout-instruct-basic
- fireworks/llama4-maverick-instruct-basic
model_display_names:
fireworks/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
fireworks/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
fireworks/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
fireworks/llama-v3p3-70b-instruct:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.fireworks.ai/inference/v1
api_key_var: FIREWORKS_API_KEY
models:
- accounts/fireworks/models/llama-v3p3-70b-instruct
- accounts/fireworks/models/llama4-scout-instruct-basic
- accounts/fireworks/models/llama4-maverick-instruct-basic
model_display_names:
accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
accounts/fireworks/models/llama-v3p3-70b-instruct:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: GROQ_API_KEY
models:
- groq/llama-3.3-70b-versatile
- groq/llama-4-scout-17b-16e-instruct
- groq/llama-4-maverick-17b-128e-instruct
model_display_names:
groq/llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
groq/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
groq/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
groq/llama-3.3-70b-versatile:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
- meta-llama/llama-4-scout-17b-16e-instruct
- meta-llama/llama-4-maverick-17b-128e-instruct
model_display_names:
llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
meta-llama/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
meta-llama/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
llama-3.3-70b-versatile:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,8 +0,0 @@
# LLAMA_STACK_PORT=5002 llama stack run meta-reference-gpu --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct --env INFERENCE_CHECKPOINT_DIR=<path_to_ckpt>
base_url: http://localhost:5002/v1/openai/v1
api_key_var: foo
models:
- meta-llama/Llama-4-Scout-17B-16E-Instruct
model_display_names:
meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
test_exclusions: {}


@ -1,9 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: OPENAI_API_KEY
models:
- openai/gpt-4o
- openai/gpt-4o-mini
model_display_names:
openai/gpt-4o: gpt-4o
openai/gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@ -1,9 +0,0 @@
base_url: https://api.openai.com/v1
api_key_var: OPENAI_API_KEY
models:
- gpt-4o
- gpt-4o-mini
model_display_names:
gpt-4o: gpt-4o
gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: TOGETHER_API_KEY
models:
- together/meta-llama/Llama-3.3-70B-Instruct-Turbo
- together/meta-llama/Llama-4-Scout-17B-16E-Instruct
- together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
together/meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
together/meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
together/meta-llama/Llama-3.3-70B-Instruct-Turbo:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.together.xyz/v1
api_key_var: TOGETHER_API_KEY
models:
- meta-llama/Llama-3.3-70B-Instruct-Turbo
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
meta-llama/Llama-3.3-70B-Instruct-Turbo:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,96 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import pytest
def pytest_addoption(parser):
parser.addoption(
"--base-url",
action="store",
help="Base URL for OpenAI compatible API",
)
parser.addoption(
"--api-key",
action="store",
help="API key to use for the provider",
)
parser.addoption(
"--provider",
action="store",
help="Provider to use for testing",
)
parser.addoption(
"--model",
action="store",
help="Model to use for testing",
)
pytest_plugins = [
"pytest_jsonreport",
"tests.verifications.openai_api.fixtures.fixtures",
"tests.verifications.openai_api.fixtures.load",
]
@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
"""Add model and case_id to pytest-json report metadata."""
metadata = {}
nodeid = item.nodeid
# 1. Extract model from callspec if available
model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
if model:
metadata["model"] = model
else:
# Fallback: Try parsing from nodeid (less reliable)
match_model = re.search(r"\[(.*?)-", nodeid)
if match_model:
model = match_model.group(1) # Store model even if found via fallback
metadata["model"] = model
else:
print(f"Warning: Could not determine model for test {nodeid}")
model = None # Ensure model is None if not found
# 2. Extract case_id using the known model string if possible
if model:
# Construct a regex pattern to find the case_id *after* the model name and a hyphen.
# Escape the model name in case it contains regex special characters.
pattern = re.escape(model) + r"-(.*?)\]$"
match_case = re.search(pattern, nodeid)
if match_case:
case_id = match_case.group(1)
metadata["case_id"] = case_id
else:
# Fallback if the pattern didn't match (e.g., nodeid format unexpected)
# Try the old less specific regex as a last resort.
match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
if match_case_fallback:
case_id = match_case_fallback.group(1)
metadata["case_id"] = case_id
print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
else:
print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
metadata["case_id"] = "parsing_failed"
elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
# Cannot reliably parse case_id without model, but we know it's a case test.
# Try the generic fallback regex.
match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
if match_case_fallback:
case_id = match_case_fallback.group(1)
metadata["case_id"] = case_id
print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
else:
print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
metadata["case_id"] = "parsing_failed_no_model"
# else: Not a test with a model or case param we need to handle.
return metadata


@ -1,502 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Test Report Generator
Description:
This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
for different providers, aggregates the results from JSON reports, and generates
a markdown summary report (REPORT.md).
It automatically cleans up old test result files, keeping only the latest
per provider.
Configuration:
- Provider details (models, display names) are loaded from `tests/verifications/conf/*.yaml`.
- Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
- Test results are stored in `tests/verifications/test_results/`.
Usage:
# Generate a report using the latest existing test results
python tests/verifications/generate_report.py
# Run tests for all configured providers and generate a report
python tests/verifications/generate_report.py --run-tests
# Run tests only for specific providers (space-separated)
python tests/verifications/generate_report.py --run-tests --providers fireworks openai
# Run tests matching a keyword expression (uses pytest -k)
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
# Run a specific test case for a provider
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
# Save the report to a custom location
python tests/verifications/generate_report.py --output custom_report.md
"""
import argparse
import json
import os
import re
import subprocess
import time
from collections import defaultdict
from pathlib import Path
from typing import Any
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
# Define the root directory for test results
RESULTS_DIR = Path(__file__).parent / "test_results"
RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
DEFAULT_PROVIDERS = [
"meta_reference",
"together",
"fireworks",
"openai",
]
VERIFICATION_CONFIG = _load_all_verification_configs()
def run_tests(provider, keyword=None):
"""Run pytest for a specific provider and save results"""
print(f"Running tests for provider: {provider}")
timestamp = int(time.time())
# Use a constant filename for the final result and temp file
result_file = RESULTS_DIR / f"{provider}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}.json"
# Determine project root directory relative to this script
project_root = Path(__file__).parent.parent.parent
# Run pytest with JSON output
cmd = [
"python",
"-m",
"pytest",
"tests/verifications/openai_api/test_chat_completion.py",
f"--provider={provider}",
"-v",
"--json-report",
f"--json-report-file={temp_json_file}",
]
# Append -k argument if provided
if keyword:
cmd.extend(["-k", keyword])
try:
# Run subprocess with cwd set to project root
result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
print(f"Pytest exit code: {result.returncode}")
# Check if the JSON file was created
if temp_json_file.exists():
with open(temp_json_file) as f:
test_results = json.load(f)
test_results["run_timestamp"] = timestamp
# Save results to the final (overwritten) file
with open(result_file, "w") as f:
json.dump(test_results, f, indent=2)
f.write("\n") # Add a trailing newline for precommit
# Clean up temp file
temp_json_file.unlink()
print(f"Test results saved to {result_file}")
return result_file
else:
print(f"Error: JSON report file not created for {provider}")
print(f"Command stdout: {result.stdout}")
print(f"Command stderr: {result.stderr}")
return None
except Exception as e:
print(f"Error running tests for {provider}: {e}")
return None
def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
"""Runs tests for a list of providers."""
print(f"Running tests for providers: {', '.join(providers_to_run)}")
for provider in providers_to_run:
run_tests(provider.strip(), keyword=keyword)
print("Finished running tests.")
def parse_results(
result_file,
) -> tuple[defaultdict[str, defaultdict[str, dict[str, bool]]], defaultdict[str, set[str]], set[str], str]:
"""Parse a single test results file.
Returns:
Tuple containing:
- parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
- providers_in_file: DefaultDict[provider, Set[model]] found in this file.
- tests_in_file: Set[test_name] found in this file.
- run_timestamp: Timestamp when the test was run
"""
if not os.path.exists(result_file):
print(f"Results file does not exist: {result_file}")
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
with open(result_file) as f:
results = json.load(f)
# Initialize results dictionary with specific types
parsed_results: defaultdict[str, defaultdict[str, dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
providers_in_file: defaultdict[str, set[str]] = defaultdict(set)
tests_in_file: set[str] = set()
# Extract provider from filename (e.g., "openai.json" -> "openai")
provider: str = result_file.stem
# Extract run timestamp from the JSON data
run_timestamp_unix = results.get("run_timestamp")
run_timestamp_str = (
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_timestamp_unix))
if run_timestamp_unix is not None
else "Unknown"
)
# Debug: Print summary of test results
print(f"Test results summary for {provider}:")
print(f"Total tests: {results.get('summary', {}).get('total', 0)}")
print(f"Passed: {results.get('summary', {}).get('passed', 0)}")
print(f"Failed: {results.get('summary', {}).get('failed', 0)}")
print(f"Error: {results.get('summary', {}).get('error', 0)}")
print(f"Skipped: {results.get('summary', {}).get('skipped', 0)}")
# Extract test results
if "tests" not in results or not results["tests"]:
print(f"No test results found in {result_file}")
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
# Process the tests
for test in results["tests"]:
test_id = test.get("nodeid", "")
if not (call_phase := test.get("call")):
continue
call_outcome = call_phase.get("outcome")
if call_outcome not in ("passed", "failed"):
continue
# --- Extract data from metadata ---
metadata = test.get("metadata", {})
model = metadata.get("model")
case_id = metadata.get("case_id") # String ID (if provided)
case_index = metadata.get("case_index") # Integer index (if no ID provided)
# Check if we have a model and at least one case identifier
if not model or (case_id is None and case_index is None):
print(
f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
)
continue
try:
test_name_base = test_id.split("::")[1].split("[")[0]
except (IndexError, ValueError) as e:
print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
continue
# Construct detailed test name using ID or index
if case_id is not None:
detailed_test_name = f"{test_name_base} ({case_id})"
elif case_index == 0:
# If case_id is missing and index is 0, assume single case, use base name only
detailed_test_name = test_name_base
elif case_index is not None: # case_index > 0
# Use case_index for naming if case_id wasn't provided and index > 0
detailed_test_name = f"{test_name_base} (case{case_index})"
else:
# This case should be prevented by the earlier check, but handle defensively
print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
continue
# Populate collections for this file
tests_in_file.add(detailed_test_name)
providers_in_file[provider].add(model)
if call_outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
elif call_outcome == "failed":
parsed_results[provider][model][detailed_test_name] = False
# Final Summary Warning (Optional)
if not parsed_results.get(provider):
print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def generate_report(
results_dict: dict[str, Any],
providers: dict[str, set[str]],
all_tests: set[str],
provider_timestamps: dict[str, str],
output_file=None,
):
"""Generate the markdown report.
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
The order of keys in this dict determines the report order.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
"""
if output_file is None:
# Default to creating the report in the same directory as this script
output_file = Path(__file__).parent / "REPORT.md"
else:
output_file = Path(output_file)
# Convert provider model sets to sorted lists (use passed-in providers dict)
providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
# Sort tests alphabetically (use passed-in all_tests set)
sorted_tests = sorted(all_tests)
# Calculate counts for each base test name
base_test_case_counts: defaultdict[str, int] = defaultdict(int)
base_test_name_map: dict[str, str] = {}
for test_name in sorted_tests:
match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
if match:
base_name = match.group(1).strip()
base_test_case_counts[base_name] += 1
base_test_name_map[test_name] = base_name
else:
# Should not happen with current naming, but handle defensively
base_test_case_counts[test_name] += 1
base_test_name_map[test_name] = test_name
if not sorted_tests:
print("Warning: No test results found to generate a report.")
# Optionally create an empty report or return early
with open(output_file, "w") as f:
f.write("# Test Results Report\n\nNo test results found.\n")
print(f"Generated empty report: {output_file}")
return
report = ["# Test Results Report\n"]
report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
report.append("*This report was generated by running `python tests/verifications/generate_report.py`*\n")
    # Icons for pass/fail
    pass_icon = "✅"
    fail_icon = "❌"
    na_icon = "⚪"
# Add emoji legend
report.append("## Legend\n")
report.append(f"- {pass_icon} - Test passed")
report.append(f"- {fail_icon} - Test failed")
report.append(f"- {na_icon} - Test not applicable or not run for this model")
report.append("\n")
# Add a summary section
report.append("## Summary\n")
# Count total tests and passes (use passed-in providers and all_tests)
total_tests = 0
passed_tests = 0
provider_totals = {}
for provider, models in providers_sorted.items():
provider_passed = 0
provider_total = 0
if provider in results_dict:
for model in models:
if model in results_dict[provider]:
model_results = results_dict[provider][model]
for test in sorted_tests:
if test in model_results:
provider_total += 1
total_tests += 1
if model_results[test]:
provider_passed += 1
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table (use the order from the providers dict keys)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
# Iterate through providers in the order they appear in the input dict
for provider in providers_sorted.keys():
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
for provider in providers_sorted.keys():
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
report.append(f"\n## {provider.capitalize()}\n")
# Add timestamp when test was run
if provider in provider_timestamps:
report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
# Add test command for reproducing results
test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
# Find an example test with a case ID
example_base_test_name = None
example_case_id = None
# Get first test as fallback base, handle empty list
first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
if match:
example_base_test_name = match.group(1).strip()
example_case_id = match.group(2).strip()
else:
example_base_test_name = first_test_name
base_name = base_test_name_map.get(first_test_name, first_test_name) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
test_cmd_specific_case = (
f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
)
report.append(
f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
)
# Get display names (use passed-in providers dict)
provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
display_name_map = provider_config.get("model_display_names", {})
# Add Model Key Table (use provider_models)
report.append(f"\n**Model Key ({provider.capitalize()})**\n")
provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
for model_id in provider_models:
display_name = display_name_map.get(model_id, model_id)
provider_key_lines.append(f"| {display_name} | `{model_id}` |")
report.extend(provider_key_lines)
report.append("\n")
# Create results table header (use provider_models)
display_names = [display_name_map.get(m, m) for m in provider_models]
header = "| Test | " + " | ".join(display_names) + " |"
separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
report.append(header)
report.append(separator)
# Get results for this provider from results_dict
provider_results_data = results_dict.get(provider, {})
# Add rows for each test (use sorted_tests)
for test in sorted_tests:
# Determine display name based on case count
base_name = base_test_name_map.get(test, test) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
display_test_name = base_name if case_count == 1 else test # Choose display name
row = f"| {display_test_name} |" # Use display name
for model_id in provider_models:
if model_id in provider_results_data and test in provider_results_data[model_id]:
result = pass_icon if provider_results_data[model_id][test] else fail_icon
else:
result = na_icon
row += f" {result} |"
report.append(row)
# Write to file
with open(output_file, "w") as f:
f.write("\n".join(report))
f.write("\n")
print(f"Report generated: {output_file}")
def main():
parser = argparse.ArgumentParser(description="Generate test report")
parser.add_argument("--run-tests", action="store_true", help="Run tests before generating report")
parser.add_argument(
"--providers",
type=str,
nargs="+",
help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
final_providers_order = {} # Dictionary to store results, preserving processing order
aggregated_tests = set()
provider_timestamps = {}
# 1. Determine the desired list and order of providers
if args.providers:
desired_providers = []
for provider_arg in args.providers:
desired_providers.extend([p.strip() for p in provider_arg.split(",")])
else:
desired_providers = DEFAULT_PROVIDERS # Use default order/list
# 2. Run tests if requested (using the desired provider list)
if args.run_tests:
run_multiple_tests(desired_providers, args.k)
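    # 3. Load and parse the result file for each desired provider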
for provider in desired_providers:
# Construct the expected result file path directly
result_file = RESULTS_DIR / f"{provider}.json"
if result_file.exists(): # Check if the specific file exists
print(f"Loading results for {provider} from {result_file}")
try:
parsed_data = parse_results(result_file)
parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
all_results.update(parsed_results)
aggregated_tests.update(tests_in_file)
# Add models for this provider, ensuring it's added in the correct report order
if provider in providers_in_file:
if provider not in final_providers_order:
final_providers_order[provider] = set()
final_providers_order[provider].update(providers_in_file[provider])
if run_timestamp != "Unknown":
provider_timestamps[provider] = run_timestamp
else:
print(
f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
)
except Exception as e:
print(f"Error parsing results for provider {provider} from {result_file}: {e}")
else:
# Only print warning if we expected results (i.e., provider was in the desired list)
print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
    # 4. Generate the report using the filtered & ordered results
print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":
main()
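# Example invocations (flags as defined by the argparse options above):
#   python tests/verifications/generate_report.py
#   python tests/verifications/generate_report.py --run-tests --providers together,openai --k "streaming"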

View file

@ -1,162 +0,0 @@
# This is a temporary run file because model names used by the verification tests
# are not quite consistent with various pre-existing distributions.
#
version: '2'
image_name: openai-api-verification
apis:
- agents
- inference
- telemetry
- tool_runtime
- vector_io
- safety
providers:
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:}
- provider_id: openai
provider_type: remote::openai
config:
url: https://api.openai.com/v1
api_key: ${env.OPENAI_API_KEY:}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
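  # Note: the ${env.VAR:} references above are resolved from the environment at
  # startup and fall back to an empty value when unset, so export the relevant
  # *_API_KEY variables before running against a provider.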
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/faiss_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai-api-verification}/trace_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/agents_store.db
responses_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/responses_store.db
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
- provider_id: wolfram-alpha
provider_type: remote::wolfram-alpha
config:
api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/registry.db
models:
- metadata: {}
model_id: together/meta-llama/Llama-3.3-70B-Instruct-Turbo
provider_id: together
provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo
model_type: llm
- metadata: {}
model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct
provider_id: together
provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
model_type: llm
- metadata: {}
model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
provider_id: together
provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_type: llm
- metadata: {}
model_id: fireworks/llama-v3p3-70b-instruct
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
model_type: llm
- metadata: {}
model_id: fireworks/llama4-scout-instruct-basic
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
model_type: llm
- metadata: {}
model_id: fireworks/llama4-maverick-instruct-basic
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
model_type: llm
- metadata: {}
model_id: groq/llama-3.3-70b-versatile
provider_id: groq
provider_model_id: groq/llama-3.3-70b-versatile
model_type: llm
- metadata: {}
model_id: groq/llama-4-scout-17b-16e-instruct
provider_id: groq
provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
model_type: llm
- metadata: {}
model_id: groq/llama-4-maverick-17b-128e-instruct
provider_id: groq
provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
model_type: llm
- metadata: {}
model_id: openai/gpt-4o
provider_id: openai
provider_model_id: openai/gpt-4o
model_type: llm
- metadata: {}
model_id: openai/gpt-4o-mini
provider_id: openai
provider_model_id: openai/gpt-4o-mini
model_type: llm
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
provider_id: wolfram-alpha
server:
port: 8321

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,40 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
def pytest_generate_tests(metafunc):
"""Dynamically parametrize tests based on the selected provider and config."""
if "model" in metafunc.fixturenames:
model = metafunc.config.getoption("model")
if model:
metafunc.parametrize("model", [model])
return
provider = metafunc.config.getoption("provider")
if not provider:
print("Warning: --provider not specified. Skipping model parametrization.")
metafunc.parametrize("model", [])
return
try:
config_data = _load_all_verification_configs()
except (OSError, FileNotFoundError) as e:
print(f"ERROR loading verification configs: {e}")
config_data = {"providers": {}}
provider_config = config_data.get("providers", {}).get(provider)
if provider_config:
models = provider_config.get("models", [])
if models:
metafunc.parametrize("model", models)
else:
print(f"Warning: No models found for provider '{provider}' in config.")
metafunc.parametrize("model", []) # Parametrize empty if no models found
else:
print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
metafunc.parametrize("model", []) # Parametrize empty if provider not found
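# With this hook, a run such as `pytest --provider=openai` parametrizes the `model`
# fixture with every model listed for that provider in the verification config,
# unless a single model is pinned via `--model`.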

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,717 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import copy
import json
from pathlib import Path
from typing import Any
import pytest
from openai import APIError
from pydantic import BaseModel
from tests.verifications.openai_api.fixtures.fixtures import (
case_id_generator,
get_base_test_name,
should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases
chat_completion_test_cases = load_test_cases("chat_completion")
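# The loaded cases are grouped by test name; for example,
# chat_completion_test_cases["test_chat_basic"]["test_params"]["case"] is a list of
# case dicts carrying "input" (messages, tools, response_format, ...) and "output".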
THIS_DIR = Path(__file__).parent
@pytest.fixture
def multi_image_data():
files = [
THIS_DIR / "fixtures/images/vision_test_1.jpg",
THIS_DIR / "fixtures/images/vision_test_2.jpg",
THIS_DIR / "fixtures/images/vision_test_3.jpg",
]
encoded_files = []
for file in files:
with open(file, "rb") as image_file:
base64_data = base64.b64encode(image_file.read()).decode("utf-8")
encoded_files.append(f"data:image/jpeg;base64,{base64_data}")
return encoded_files
# --- Test Functions ---
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_input_validation"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_error_handling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with pytest.raises(APIError) as e:
openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
tool_choice=case["input"]["tool_choice"] if "tool_choice" in case["input"] else None,
tools=case["input"]["tools"] if "tools" in case["input"] else None,
)
assert case["output"]["error"]["status_code"] == e.value.status_code
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_input_validation"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_error_handling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with pytest.raises(APIError) as e:
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
tool_choice=case["input"]["tool_choice"] if "tool_choice" in case["input"] else None,
tools=case["input"]["tools"] if "tools" in case["input"] else None,
)
for _chunk in response:
pass
assert str(case["output"]["error"]["status_code"]) in e.value.message
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
maybe_json_content = response.choices[0].message.content
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=True,
)
maybe_json_content = ""
for chunk in response:
maybe_json_content += chunk.choices[0].delta.content or ""
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert case["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=True,
)
_, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)
assert len(tool_calls_buffer) == 1
for call in tool_calls_buffer:
assert len(call["id"]) > 0
function = call["function"]
assert function["name"] == "get_weather"
args_dict = json.loads(function["arguments"])
assert "san francisco" in args_dict["location"].lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_non_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="required", # Force tool call
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0, "Expected tool call when tool_choice='required'"
expected_tool_name = case["input"]["tools"][0]["function"]["name"]
assert response.choices[0].message.tool_calls[0].function.name == expected_tool_name
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="required", # Force tool call
stream=True,
)
_, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)
assert len(tool_calls_buffer) > 0, "Expected tool call when tool_choice='required'"
expected_tool_name = case["input"]["tools"][0]["function"]["name"]
assert any(call["function"]["name"] == expected_tool_name for call in tool_calls_buffer), (
f"Expected tool call '{expected_tool_name}' not found in stream"
)
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="none",
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert response.choices[0].message.tool_calls is None, "Expected no tool calls when tool_choice='none'"
assert response.choices[0].message.content is not None, "Expected content when tool_choice='none'"
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="none",
stream=True,
)
content = ""
for chunk in stream:
delta = chunk.choices[0].delta
if delta.content:
content += delta.content
assert not delta.tool_calls, "Expected no tool call chunks when tool_choice='none'"
assert len(content) > 0, "Expected content when tool_choice='none'"
@pytest.mark.parametrize(
"case",
chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []),
ids=case_id_generator,
)
def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):
"""
Test cases for multi-turn tool calling.
Tool calls are asserted.
Tool responses are provided in the test case.
Final response is asserted.
"""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
# Create a copy of the messages list to avoid modifying the original
messages = []
tools = case["input"]["tools"]
# Use deepcopy to prevent modification across runs/parametrization
expected_results = copy.deepcopy(case["expected"])
tool_responses = copy.deepcopy(case.get("tool_responses", []))
input_messages_turns = copy.deepcopy(case["input"]["messages"])
    # Keep looping while either:
    #   1. there are more input message turns to feed into the conversation, or
    #   2. the last message is a tool response, so the model still needs to reply to it
while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
        # Do not pull in new input messages while the last message is a tool response
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = input_messages_turns.pop(0)
# Ensure new_messages is a list of message objects
if isinstance(new_messages, list):
messages.extend(new_messages)
else:
# If it's a single message object, add it directly
messages.append(new_messages)
# --- API Call ---
response = openai_client.chat.completions.create(
model=model,
messages=messages,
tools=tools,
stream=False,
)
# --- Process Response ---
assistant_message = response.choices[0].message
messages.append(assistant_message.model_dump(exclude_unset=True))
assert assistant_message.role == "assistant"
# Get the expected result data
expected = expected_results.pop(0)
num_tool_calls = expected["num_tool_calls"]
# --- Assertions based on expected result ---
assert len(assistant_message.tool_calls or []) == num_tool_calls, (
f"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}"
)
if num_tool_calls > 0:
tool_call = assistant_message.tool_calls[0]
assert tool_call.function.name == expected["tool_name"], (
f"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'"
)
# Parse the JSON string arguments before comparing
actual_arguments = json.loads(tool_call.function.arguments)
assert actual_arguments == expected["tool_arguments"], (
f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'"
)
# Prepare and append the tool response for the next turn
tool_response = tool_responses.pop(0)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": tool_response["response"],
}
)
else:
assert assistant_message.content is not None, "Expected content, but none received."
expected_answers = expected["answer"] # This is now a list
content_lower = assistant_message.content.lower()
assert any(ans.lower() in content_lower for ans in expected_answers), (
f"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'"
)
@pytest.mark.parametrize(
"case",
chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []),
ids=case_id_generator,
)
def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):
""" """
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
messages = []
tools = case["input"]["tools"]
expected_results = copy.deepcopy(case["expected"])
tool_responses = copy.deepcopy(case.get("tool_responses", []))
input_messages_turns = copy.deepcopy(case["input"]["messages"])
while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = input_messages_turns.pop(0)
if isinstance(new_messages, list):
messages.extend(new_messages)
else:
messages.append(new_messages)
# --- API Call (Streaming) ---
stream = openai_client.chat.completions.create(
model=model,
messages=messages,
tools=tools,
stream=True,
)
# --- Process Stream ---
accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)
# --- Construct Assistant Message for History ---
assistant_message_dict = {"role": "assistant"}
if accumulated_content:
assistant_message_dict["content"] = accumulated_content
if accumulated_tool_calls:
assistant_message_dict["tool_calls"] = accumulated_tool_calls
messages.append(assistant_message_dict)
# --- Assertions ---
expected = expected_results.pop(0)
num_tool_calls = expected["num_tool_calls"]
assert len(accumulated_tool_calls or []) == num_tool_calls, (
f"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}"
)
if num_tool_calls > 0:
# Use the first accumulated tool call for assertion
tool_call = accumulated_tool_calls[0]
assert tool_call["function"]["name"] == expected["tool_name"], (
f"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'"
)
# Parse the accumulated arguments string for comparison
actual_arguments = json.loads(tool_call["function"]["arguments"])
assert actual_arguments == expected["tool_arguments"], (
f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'"
)
# Prepare and append the tool response for the next turn
tool_response = tool_responses.pop(0)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call["id"],
"content": tool_response["response"],
}
)
else:
assert accumulated_content is not None and accumulated_content != "", "Expected content, but none received."
expected_answers = expected["answer"]
content_lower = accumulated_content.lower()
assert any(ans.lower() in content_lower for ans in expected_answers), (
f"Expected one of {expected_answers} in content, but got: '{accumulated_content}'"
)
@pytest.mark.parametrize("stream", [False, True], ids=["stream=False", "stream=True"])
def test_chat_multi_turn_multiple_images(
request, openai_client, model, provider, verification_config, multi_image_data, stream
):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
messages_turn1 = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": multi_image_data[0],
},
},
{
"type": "image_url",
"image_url": {
"url": multi_image_data[1],
},
},
{
"type": "text",
"text": "What furniture is in the first image that is not in the second image?",
},
],
},
]
# First API call
response1 = openai_client.chat.completions.create(
model=model,
messages=messages_turn1,
stream=stream,
)
if stream:
message_content1 = ""
for chunk in response1:
message_content1 += chunk.choices[0].delta.content or ""
else:
message_content1 = response1.choices[0].message.content
assert len(message_content1) > 0
assert any(expected in message_content1.lower().strip() for expected in {"chair", "table"}), message_content1
# Prepare messages for the second turn
messages_turn2 = messages_turn1 + [
{"role": "assistant", "content": message_content1},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": multi_image_data[2],
},
},
{"type": "text", "text": "What is in this image that is also in the first image?"},
],
},
]
# Second API call
response2 = openai_client.chat.completions.create(
model=model,
messages=messages_turn2,
stream=stream,
)
if stream:
message_content2 = ""
for chunk in response2:
message_content2 += chunk.choices[0].delta.content or ""
else:
message_content2 = response2.choices[0].message.content
assert len(message_content2) > 0
assert any(expected in message_content2.lower().strip() for expected in {"bed"}), message_content2
# --- Helper functions (structured output validation and stream accumulation) ---
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0
def _accumulate_streaming_tool_calls(stream):
"""Accumulates tool calls and content from a streaming ChatCompletion response."""
tool_calls_buffer = {}
current_id = None
full_content = "" # Initialize content accumulator
# Process streaming chunks
for chunk in stream:
choice = chunk.choices[0]
delta = choice.delta
# Accumulate content
if delta.content:
full_content += delta.content
if delta.tool_calls is None:
continue
for tool_call_delta in delta.tool_calls:
if tool_call_delta.id:
current_id = tool_call_delta.id
call_id = current_id
# Skip if no ID seen yet for this tool call delta
if not call_id:
continue
func_delta = tool_call_delta.function
if call_id not in tool_calls_buffer:
tool_calls_buffer[call_id] = {
"id": call_id,
"type": "function", # Assume function type
"function": {"name": None, "arguments": ""}, # Nested structure
}
# Accumulate name and arguments into the nested function dict
if func_delta:
if func_delta.name:
tool_calls_buffer[call_id]["function"]["name"] = func_delta.name
if func_delta.arguments:
tool_calls_buffer[call_id]["function"]["arguments"] += func_delta.arguments
# Return content and tool calls as a list
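    # Illustrative result (IDs and argument strings depend on the model's stream):
    #   ("", [{"id": "call_0", "type": "function",
    #          "function": {"name": "get_weather", "arguments": '{"location": "San Francisco"}'}}])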
return full_content, list(tool_calls_buffer.values())

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long