Merge branch 'main' into feat/litellm_sambanova_usage

jhpiedrahitao 2025-04-11 19:28:02 -05:00
commit 172a918fe3
66 changed files with 9320 additions and 9446 deletions


@@ -0,0 +1,216 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import pytest
from openai import OpenAI

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

from ..test_cases.test_case import TestCase


def provider_from_model(client_with_models, model_id):
    models = {m.identifier: m for m in client_with_models.models.list()}
    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    return providers[provider_id]


def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::bedrock",
        "remote::cerebras",
        "remote::databricks",
        # Technically Nvidia does support OpenAI completions, but none of their hosted models
        # support both completions and chat completions endpoint and all the Llama models are
        # just chat completions
        "remote::nvidia",
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")


def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::bedrock",
        "remote::cerebras",
        "remote::databricks",
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")


def skip_if_provider_isnt_vllm(client_with_models, model_id):
    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type != "remote::vllm":
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")


@pytest.fixture
def openai_client(client_with_models):
    base_url = f"{client_with_models.base_url}/v1/openai/v1"
    return OpenAI(base_url=base_url, api_key="bar")


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.text) > 10


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=True,
        max_tokens=50,
    )
    streamed_content = [chunk.choices[0].text for chunk in response]
    content_str = "".join(streamed_content).lower().strip()
    assert len(content_str) > 10


@pytest.mark.parametrize(
    "prompt_logprobs",
    [
        1,
        0,
    ],
)
def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "Hello, world!"
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "prompt_logprobs": prompt_logprobs,
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.prompt_logprobs) > 0


def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "I am feeling really sad today."
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "guided_choice": ["joy", "sadness"],
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert choice.text in ["joy", "sadness"]


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:non_streaming_01",
        "inference:chat_completion:non_streaming_02",
    ],
)
def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[
            {
                "role": "user",
                "content": question,
            }
        ],
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert len(message_content) > 0
    assert expected.lower() in message_content


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:streaming_01",
        "inference:chat_completion:streaming_02",
    ],
)
def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[{"role": "user", "content": question}],
        stream=True,
        timeout=120,  # Increase timeout to 2 minutes for large conversation history
    )
    streamed_content = []
    for chunk in response:
        if chunk.choices[0].delta.content:
            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
    assert len(streamed_content) > 0
    assert expected.lower() in "".join(streamed_content)


@@ -0,0 +1,326 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
import os
import unittest
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from llama_stack.apis.inference.inference import CompletionMessage, UserMessage
from llama_stack.apis.safety import RunShieldResponse, ViolationLevel
from llama_stack.apis.shields import Shield
from llama_stack.providers.remote.safety.nvidia.config import NVIDIASafetyConfig
from llama_stack.providers.remote.safety.nvidia.nvidia import NVIDIASafetyAdapter


class TestNVIDIASafetyAdapter(unittest.TestCase):
    def setUp(self):
        os.environ["NVIDIA_GUARDRAILS_URL"] = "http://nemo.test"

        # Initialize the adapter
        self.config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
        )
        self.adapter = NVIDIASafetyAdapter(config=self.config)
        self.shield_store = AsyncMock()
        self.adapter.shield_store = self.shield_store

        # Mock the HTTP request methods
        self.guardrails_post_patcher = patch(
            "llama_stack.providers.remote.safety.nvidia.nvidia.NeMoGuardrails._guardrails_post"
        )
        self.mock_guardrails_post = self.guardrails_post_patcher.start()
        self.mock_guardrails_post.return_value = {"status": "allowed"}

    def tearDown(self):
        """Clean up after each test."""
        self.guardrails_post_patcher.stop()

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, run_async):
        self.run_async = run_async

    def _assert_request(
        self,
        mock_call: MagicMock,
        expected_url: str,
        expected_headers: dict[str, str] | None = None,
        expected_json: dict[str, Any] | None = None,
    ) -> None:
        """
        Helper method to verify request details in mock API calls.

        Args:
            mock_call: The MagicMock object that was called
            expected_url: The expected URL to which the request was made
            expected_headers: Optional dictionary of expected request headers
            expected_json: Optional dictionary of expected JSON payload
        """
        call_args = mock_call.call_args

        # Check URL
        assert call_args[0][0] == expected_url

        # Check headers if provided
        if expected_headers:
            for key, value in expected_headers.items():
                assert call_args[1]["headers"][key] == value

        # Check JSON if provided
        if expected_json:
            for key, value in expected_json.items():
                if isinstance(value, dict):
                    for nested_key, nested_value in value.items():
                        assert call_args[1]["json"][key][nested_key] == nested_value
                else:
                    assert call_args[1]["json"][key] == value

    def test_register_shield_with_valid_id(self):
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier="test-shield",
            provider_resource_id="test-model",
        )

        # Register the shield
        self.run_async(self.adapter.register_shield(shield))

    def test_register_shield_without_id(self):
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier="test-shield",
            provider_resource_id="",
        )

        # Register the shield should raise a ValueError
        with self.assertRaises(ValueError):
            self.run_async(self.adapter.register_shield(shield))

    def test_run_shield_allowed(self):
        # Set up the shield
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API response
        self.mock_guardrails_post.return_value = {"status": "allowed"}

        # Run the shield
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        result = self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the result
        assert isinstance(result, RunShieldResponse)
        assert result.violation is None

    def test_run_shield_blocked(self):
        # Set up the shield
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API response
        self.mock_guardrails_post.return_value = {"status": "blocked", "rails_status": {"reason": "harmful_content"}}

        # Run the shield
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        result = self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the result
        assert result.violation is not None
        assert isinstance(result, RunShieldResponse)
        assert result.violation.user_message == "Sorry I cannot do this."
        assert result.violation.violation_level == ViolationLevel.ERROR
        assert result.violation.metadata == {"reason": "harmful_content"}

    def test_run_shield_not_found(self):
        # Set up shield store to return None
        shield_id = "non-existent-shield"
        self.shield_store.get_shield.return_value = None

        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
        ]

        with self.assertRaises(ValueError):
            self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was not called
        self.mock_guardrails_post.assert_not_called()

    def test_run_shield_http_error(self):
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API to raise an exception
        error_msg = "API Error: 500 Internal Server Error"
        self.mock_guardrails_post.side_effect = Exception(error_msg)

        # Running the shield should raise an exception
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        with self.assertRaises(Exception) as context:
            self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the exception message
        assert error_msg in str(context.exception)

    def test_init_nemo_guardrails(self):
        from llama_stack.providers.remote.safety.nvidia.nvidia import NeMoGuardrails

        test_config_id = "test-custom-config-id"
        config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
            config_id=test_config_id,
        )
        # Initialize with default parameters
        test_model = "test-model"
        guardrails = NeMoGuardrails(config, test_model)

        # Verify the attributes are set correctly
        assert guardrails.config_id == test_config_id
        assert guardrails.model == test_model
        assert guardrails.threshold == 0.9  # Default value
        assert guardrails.temperature == 1.0  # Default value
        assert guardrails.guardrails_service_url == os.environ["NVIDIA_GUARDRAILS_URL"]

        # Initialize with custom parameters
        guardrails = NeMoGuardrails(config, test_model, threshold=0.8, temperature=0.7)

        # Verify the attributes are set correctly
        assert guardrails.config_id == test_config_id
        assert guardrails.model == test_model
        assert guardrails.threshold == 0.8
        assert guardrails.temperature == 0.7
        assert guardrails.guardrails_service_url == os.environ["NVIDIA_GUARDRAILS_URL"]

    def test_init_nemo_guardrails_invalid_temperature(self):
        from llama_stack.providers.remote.safety.nvidia.nvidia import NeMoGuardrails

        config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
            config_id="test-custom-config-id",
        )
        with self.assertRaises(ValueError):
            NeMoGuardrails(config, "test-model", temperature=0)


@@ -1,6 +1,6 @@
# Test Results Report
*Generated on: 2025-04-08 21:14:02*
*Generated on: 2025-04-10 16:48:18*
*This report was generated by running `python tests/verifications/generate_report.py`*
@@ -15,74 +15,118 @@
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Together | 67.7% | 21 | 31 |
| Fireworks | 90.3% | 28 | 31 |
| Openai | 100.0% | 22 | 22 |
| Together | 64.7% | 22 | 34 |
| Fireworks | 82.4% | 28 | 34 |
| Openai | 100.0% | 24 | 24 |
## Together
*Tests run on: 2025-04-08 16:19:59*
*Tests run on: 2025-04-10 16:46:35*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_non_streaming_basic and earth"
```
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
**Model Key (Together)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_calling | ✅ | ❌ | ❌ |
## Fireworks
*Tests run on: 2025-04-08 16:18:28*
*Tests run on: 2025-04-10 16:44:44*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_non_streaming_basic and earth"
```
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
**Model Key (Fireworks)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_calling | ❌ | ❌ | ❌ |
## Openai
*Tests run on: 2025-04-08 16:22:02*
*Tests run on: 2025-04-10 16:47:28*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_non_streaming_basic and earth"
```
**Model Key (Openai)**
| Display Name | Full Model ID |
| --- | --- |
| gpt-4o | `gpt-4o` |
| gpt-4o-mini | `gpt-4o-mini` |
| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_non_streaming_image | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_streaming_image | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_streaming_tool_calling | ✅ | ✅ |


@@ -0,0 +1,10 @@
base_url: https://api.cerebras.ai/v1
api_key_var: CEREBRAS_API_KEY
models:
- llama-3.3-70b
model_display_names:
  llama-3.3-70b: Llama-3.3-70B-Instruct
test_exclusions:
  llama-3.3-70b:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,14 @@
base_url: https://api.fireworks.ai/inference/v1
api_key_var: FIREWORKS_API_KEY
models:
- accounts/fireworks/models/llama-v3p3-70b-instruct
- accounts/fireworks/models/llama4-scout-instruct-basic
- accounts/fireworks/models/llama4-maverick-instruct-basic
model_display_names:
  accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
  accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
  accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
  accounts/fireworks/models/llama-v3p3-70b-instruct:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,14 @@
base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
- llama-4-scout-17b-16e-instruct
- llama-4-maverick-17b-128e-instruct
model_display_names:
  llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
  llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
  llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
  llama-3.3-70b-versatile:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,9 @@
base_url: https://api.openai.com/v1
api_key_var: OPENAI_API_KEY
models:
- gpt-4o
- gpt-4o-mini
model_display_names:
  gpt-4o: gpt-4o
  gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@@ -0,0 +1,14 @@
base_url: https://api.together.xyz/v1
api_key_var: TOGETHER_API_KEY
models:
- meta-llama/Llama-3.3-70B-Instruct-Turbo
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
  meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
  meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
  meta-llama/Llama-3.3-70B-Instruct-Turbo:
  - test_chat_non_streaming_image
  - test_chat_streaming_image
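
For context, a minimal sketch (not part of this diff) of how one of these provider config files can be read. The filename below is illustrative; in the verification suite the files are loaded by `_load_all_verification_configs` in `tests.verifications.openai_api.fixtures.fixtures`, and `pyyaml` is declared as a dependency in the `generate_report.py` script header.

```python
import os

import yaml  # pyyaml

# Illustrative filename; the real loader walks the verification config directory.
with open("together.yaml") as f:
    conf = yaml.safe_load(f)

api_key = os.getenv(conf["api_key_var"])                     # e.g. TOGETHER_API_KEY
model_id = conf["models"][0]                                 # full provider model ID
display = conf["model_display_names"][model_id]              # e.g. "Llama-3.3-70B-Instruct"
skipped = conf.get("test_exclusions", {}).get(model_id, [])  # tests to skip for this model
```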


@@ -4,6 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import pytest
def pytest_addoption(parser):
parser.addoption(
@@ -14,7 +18,7 @@ def pytest_addoption(parser):
parser.addoption(
"--api-key",
action="store",
help="API key",
help="API key to use for the provider",
)
parser.addoption(
"--provider",
@@ -24,5 +28,64 @@ def pytest_addoption(parser):
pytest_plugins = [
"tests.verifications.openai.fixtures.fixtures",
"pytest_jsonreport",
"tests.verifications.openai_api.fixtures.fixtures",
"tests.verifications.openai_api.fixtures.load",
]
@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    """Add model and case_id to pytest-json report metadata."""
    metadata = {}
    nodeid = item.nodeid

    # 1. Extract model from callspec if available
    model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
    if model:
        metadata["model"] = model
    else:
        # Fallback: Try parsing from nodeid (less reliable)
        match_model = re.search(r"\[(.*?)-", nodeid)
        if match_model:
            model = match_model.group(1)  # Store model even if found via fallback
            metadata["model"] = model
        else:
            print(f"Warning: Could not determine model for test {nodeid}")
            model = None  # Ensure model is None if not found

    # 2. Extract case_id using the known model string if possible
    if model:
        # Construct a regex pattern to find the case_id *after* the model name and a hyphen.
        # Escape the model name in case it contains regex special characters.
        pattern = re.escape(model) + r"-(.*?)\]$"
        match_case = re.search(pattern, nodeid)
        if match_case:
            case_id = match_case.group(1)
            metadata["case_id"] = case_id
        else:
            # Fallback if the pattern didn't match (e.g., nodeid format unexpected)
            # Try the old less specific regex as a last resort.
            match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
            if match_case_fallback:
                case_id = match_case_fallback.group(1)
                metadata["case_id"] = case_id
                print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
            else:
                print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
                if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
                    metadata["case_id"] = "parsing_failed"
    elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
        # Cannot reliably parse case_id without model, but we know it's a case test.
        # Try the generic fallback regex.
        match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
        if match_case_fallback:
            case_id = match_case_fallback.group(1)
            metadata["case_id"] = case_id
            print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
        else:
            print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
            metadata["case_id"] = "parsing_failed_no_model"
    # else: Not a test with a model or case param we need to handle.

    return metadata
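
For reference, a minimal illustration (not part of this diff) of the case_id extraction the hook performs. The nodeid and model value below are made-up examples in the style of the parametrized verification tests above.

```python
import re

# Hypothetical parametrized nodeid and its model param (normally read from item.callspec.params)
nodeid = 'test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]'
model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"

# Same pattern the hook builds: the case_id is whatever sits between "<model>-" and the closing "]"
match = re.search(re.escape(model) + r"-(.*?)\]$", nodeid)
print(match.group(1))  # "earth" -> stored as metadata["case_id"]
```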


@@ -4,27 +4,48 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pytest-json-report",
# "pyyaml",
# ]
# ///
"""
Test Report Generator
Requirements:
pip install pytest-json-report
Description:
This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
for different providers, aggregates the results from JSON reports, and generates
a markdown summary report (REPORT.md).
It automatically cleans up old test result files, keeping only the latest
per provider.
Configuration:
- Provider details (models, display names) are loaded from `tests/verifications/config.yaml`.
- Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
- Test results are stored in `tests/verifications/test_results/`.
Usage:
# Generate a report using existing test results
# Generate a report using the latest existing test results
python tests/verifications/generate_report.py
# Run tests and generate a report
# Run tests for all configured providers and generate a report
python tests/verifications/generate_report.py --run-tests
# Run tests for specific providers
# Run tests only for specific providers (space-separated)
python tests/verifications/generate_report.py --run-tests --providers fireworks openai
# Run tests matching a keyword expression (uses pytest -k)
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
# Run a specific test case for a provider
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
# Save the report to a custom location
python tests/verifications/generate_report.py --output custom_report.md
# Clean up old test result files
python tests/verifications/generate_report.py --cleanup
"""
import argparse
@@ -35,6 +56,9 @@ import subprocess
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Set, Tuple
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
# Define the root directory for test results
RESULTS_DIR = Path(__file__).parent / "test_results"
@@ -43,47 +67,52 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
# Custom order of providers
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
# Dictionary to store providers and their models (will be populated dynamically)
PROVIDERS = defaultdict(set)
# Tests will be dynamically extracted from results
ALL_TESTS = set()
VERIFICATION_CONFIG = _load_all_verification_configs()
def run_tests(provider):
def run_tests(provider, keyword=None):
"""Run pytest for a specific provider and save results"""
print(f"Running tests for provider: {provider}")
timestamp = int(time.time())
result_file = RESULTS_DIR / f"{provider}_{timestamp}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json"
# Use a constant filename for the final result and temp file
result_file = RESULTS_DIR / f"{provider}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}.json"
# Determine project root directory relative to this script
project_root = Path(__file__).parent.parent.parent
# Run pytest with JSON output
cmd = [
"python",
"-m",
"pytest",
"tests/verifications/openai/test_chat_completion.py",
"tests/verifications/openai_api/test_chat_completion.py",
f"--provider={provider}",
"-v",
"--json-report",
f"--json-report-file={temp_json_file}",
]
# Append -k argument if provided
if keyword:
cmd.extend(["-k", keyword])
try:
result = subprocess.run(cmd, capture_output=True, text=True)
# Run subprocess with cwd set to project root
result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
print(f"Pytest exit code: {result.returncode}")
# Check if the JSON file was created
if temp_json_file.exists():
# Read the JSON file and save it to our results format
with open(temp_json_file, "r") as f:
test_results = json.load(f)
# Save results to our own format with a trailing newline
test_results["run_timestamp"] = timestamp
# Save results to the final (overwritten) file
with open(result_file, "w") as f:
json.dump(test_results, f, indent=2)
f.write("\n") # Add a trailing newline for precommit
@@ -103,18 +132,40 @@ def run_tests(provider):
return None
def parse_results(result_file):
"""Parse the test results file and extract pass/fail by model and test"""
def parse_results(
result_file,
) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:
"""Parse a single test results file.
Returns:
Tuple containing:
- parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
- providers_in_file: DefaultDict[provider, Set[model]] found in this file.
- tests_in_file: Set[test_name] found in this file.
- run_timestamp: Timestamp when the test was run
"""
if not os.path.exists(result_file):
print(f"Results file does not exist: {result_file}")
return {}
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
with open(result_file, "r") as f:
results = json.load(f)
# Initialize results dictionary
parsed_results = defaultdict(lambda: defaultdict(dict))
provider = os.path.basename(result_file).split("_")[0]
# Initialize results dictionary with specific types
parsed_results: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
providers_in_file: DefaultDict[str, Set[str]] = defaultdict(set)
tests_in_file: Set[str] = set()
# Extract provider from filename (e.g., "openai.json" -> "openai")
provider: str = result_file.stem
# Extract run timestamp from the JSON data
run_timestamp_unix = results.get("run_timestamp")
run_timestamp_str = (
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_timestamp_unix))
if run_timestamp_unix is not None
else "Unknown"
)
# Debug: Print summary of test results
print(f"Test results summary for {provider}:")
@@ -127,195 +178,131 @@ def parse_results(result_file):
# Extract test results
if "tests" not in results or not results["tests"]:
print(f"No test results found in {result_file}")
return parsed_results
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
# Map for normalizing model names
model_name_map = {
"Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct",
"Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct",
"Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
"Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct",
"Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct",
"Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct",
"Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct",
"gpt-4o": "gpt-4o",
"gpt-4o-mini": "gpt-4o-mini",
}
# Keep track of all models found for this provider
provider_models = set()
# Track all unique test cases for each base test
test_case_counts = defaultdict(int)
# First pass: count the number of cases for each test
# Process the tests
for test in results["tests"]:
test_id = test.get("nodeid", "")
if "call" in test:
test_name = test_id.split("::")[1].split("[")[0]
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
if input_output_match:
test_case_counts[test_name] += 1
if not (call_phase := test.get("call")):
continue
call_outcome = call_phase.get("outcome")
if call_outcome not in ("passed", "failed"):
continue
# Second pass: process the tests with case numbers only for tests with multiple cases
for test in results["tests"]:
test_id = test.get("nodeid", "")
outcome = test.get("outcome", "")
# --- Extract data from metadata ---
metadata = test.get("metadata", {})
model = metadata.get("model")
case_id = metadata.get("case_id") # String ID (if provided)
case_index = metadata.get("case_index") # Integer index (if no ID provided)
# Only process tests that have been executed (not setup errors)
if "call" in test:
# Regular test that actually ran
test_name = test_id.split("::")[1].split("[")[0]
# Check if we have a model and at least one case identifier
if not model or (case_id is None and case_index is None):
print(
f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
)
continue
# Extract input_output parameter to differentiate between test cases
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
input_output_index = input_output_match.group(1) if input_output_match else ""
try:
test_name_base = test_id.split("::")[1].split("[")[0]
except (IndexError, ValueError) as e:
print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
continue
# Create a more detailed test name with case number only if there are multiple cases
detailed_test_name = test_name
if input_output_index and test_case_counts[test_name] > 1:
detailed_test_name = f"{test_name} (case {input_output_index})"
# Construct detailed test name using ID or index
if case_id is not None:
detailed_test_name = f"{test_name_base} ({case_id})"
elif case_index == 0:
# If case_id is missing and index is 0, assume single case, use base name only
detailed_test_name = test_name_base
elif case_index is not None: # case_index > 0
# Use case_index for naming if case_id wasn't provided and index > 0
detailed_test_name = f"{test_name_base} (case{case_index})"
else:
# This case should be prevented by the earlier check, but handle defensively
print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
continue
# Track all unique test names
ALL_TESTS.add(detailed_test_name)
# Populate collections for this file
tests_in_file.add(detailed_test_name)
providers_in_file[provider].add(model)
# Extract model name from test_id using a more robust pattern
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
if model_match:
raw_model = model_match.group(1)
model = model_name_map.get(raw_model, raw_model)
if call_outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
elif call_outcome == "failed":
parsed_results[provider][model][detailed_test_name] = False
# Add to set of known models for this provider
provider_models.add(model)
# Final Summary Warning (Optional)
if not parsed_results.get(provider):
print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
# Also update the global PROVIDERS dictionary
PROVIDERS[provider].add(model)
# Store the result
if outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
else:
parsed_results[provider][model][detailed_test_name] = False
print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}")
elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed":
# This is a setup failure, which likely means a configuration issue
# Extract the base test name and model name
parts = test_id.split("::")
if len(parts) > 1:
test_name = parts[1].split("[")[0]
# Extract input_output parameter to differentiate between test cases
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
input_output_index = input_output_match.group(1) if input_output_match else ""
# Create a more detailed test name with case number only if there are multiple cases
detailed_test_name = test_name
if input_output_index and test_case_counts[test_name] > 1:
detailed_test_name = f"{test_name} (case {input_output_index})"
if detailed_test_name in ALL_TESTS:
# Use a more robust pattern for model extraction
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
if model_match:
raw_model = model_match.group(1)
model = model_name_map.get(raw_model, raw_model)
# Add to set of known models for this provider
provider_models.add(model)
# Also update the global PROVIDERS dictionary
PROVIDERS[provider].add(model)
# Mark setup failures as false (failed)
parsed_results[provider][model][detailed_test_name] = False
print(f"Parsed setup failure: {detailed_test_name} for model {model}")
# Debug: Print parsed results
if not parsed_results[provider]:
print(f"Warning: No test results parsed for provider {provider}")
else:
for model, tests in parsed_results[provider].items():
print(f"Model {model}: {len(tests)} test results")
return parsed_results
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def cleanup_old_results():
"""Clean up old test result files, keeping only the newest N per provider"""
for provider in PROVIDERS.keys():
# Get all result files for this provider
provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json"))
# Sort by timestamp (newest first)
provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)
# Remove old files beyond the max to keep
if len(provider_files) > MAX_RESULTS_PER_PROVIDER:
for old_file in provider_files[MAX_RESULTS_PER_PROVIDER:]:
try:
old_file.unlink()
print(f"Removed old result file: {old_file}")
except Exception as e:
print(f"Error removing file {old_file}: {e}")
def get_latest_results_by_provider():
"""Get the latest test result file for each provider"""
def get_all_result_files_by_provider():
"""Get all test result files, keyed by provider."""
provider_results = {}
# Get all result files
result_files = list(RESULTS_DIR.glob("*.json"))
# Extract all provider names from filenames
all_providers = set()
for file in result_files:
# File format is provider_timestamp.json
parts = file.stem.split("_")
if len(parts) >= 2:
all_providers.add(parts[0])
# Group by provider
for provider in all_providers:
provider_files = [f for f in result_files if f.name.startswith(f"{provider}_")]
# Sort by timestamp (newest first)
provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)
if provider_files:
provider_results[provider] = provider_files[0]
provider = file.stem
if provider:
provider_results[provider] = file
return provider_results
def generate_report(results_dict, output_file=None):
"""Generate the markdown report"""
def generate_report(
results_dict: Dict[str, Any],
providers: Dict[str, Set[str]],
all_tests: Set[str],
provider_timestamps: Dict[str, str],
output_file=None,
):
"""Generate the markdown report.
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
"""
if output_file is None:
# Default to creating the report in the same directory as this script
output_file = Path(__file__).parent / "REPORT.md"
else:
output_file = Path(output_file)
# Get the timestamp from result files
provider_timestamps = {}
provider_results = get_latest_results_by_provider()
for provider, result_file in provider_results.items():
# Extract timestamp from filename (format: provider_timestamp.json)
try:
timestamp_str = result_file.stem.split("_")[1]
timestamp = int(timestamp_str)
formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
provider_timestamps[provider] = formatted_time
except (IndexError, ValueError):
provider_timestamps[provider] = "Unknown"
# Convert provider model sets to sorted lists (use passed-in providers dict)
providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
# Convert provider model sets to sorted lists
for provider in PROVIDERS:
PROVIDERS[provider] = sorted(PROVIDERS[provider])
# Sort tests alphabetically (use passed-in all_tests set)
sorted_tests = sorted(all_tests)
# Sort tests alphabetically
sorted_tests = sorted(ALL_TESTS)
# Calculate counts for each base test name
base_test_case_counts: DefaultDict[str, int] = defaultdict(int)
base_test_name_map: Dict[str, str] = {}
for test_name in sorted_tests:
match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
if match:
base_name = match.group(1).strip()
base_test_case_counts[base_name] += 1
base_test_name_map[test_name] = base_name
else:
# Should not happen with current naming, but handle defensively
base_test_case_counts[test_name] += 1
base_test_name_map[test_name] = test_name
if not sorted_tests:
print("Warning: No test results found to generate a report.")
# Optionally create an empty report or return early
with open(output_file, "w") as f:
f.write("# Test Results Report\n\nNo test results found.\n")
print(f"Generated empty report: {output_file}")
return
report = ["# Test Results Report\n"]
report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
@@ -336,19 +323,15 @@ def generate_report(results_dict, output_file=None):
# Add a summary section
report.append("## Summary\n")
# Count total tests and passes
# Count total tests and passes (use passed-in providers and all_tests)
total_tests = 0
passed_tests = 0
provider_totals = {}
# Prepare summary data
for provider in PROVIDERS.keys():
for provider, models in providers_sorted.items():
provider_passed = 0
provider_total = 0
if provider in results_dict:
provider_models = PROVIDERS[provider]
for model in provider_models:
for model in models:
if model in results_dict[provider]:
model_results = results_dict[provider][model]
for test in sorted_tests:
@@ -358,33 +341,26 @@ def generate_report(results_dict, output_file=None):
if model_results[test]:
provider_passed += 1
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table
# Add summary table (use passed-in providers dict)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
# Use the custom order for summary table
for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]:
for provider in [p for p in PROVIDER_ORDER if p in providers]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
# Add providers not in the custom order
for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]:
for provider in [p for p in providers if p not in PROVIDER_ORDER]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
# Process each provider in the custom order, then any additional providers
for provider in sorted(
PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
):
if not PROVIDERS[provider]:
# Skip providers with no models
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
report.append(f"\n## {provider.capitalize()}\n")
@@ -394,34 +370,70 @@ def generate_report(results_dict, output_file=None):
report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
# Add test command for reproducing results
test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n{test_cmd}\n```\n")
test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
# Get the relevant models for this provider
provider_models = PROVIDERS[provider]
# Find an example test with a case ID
example_base_test_name = None
example_case_id = None
# Get first test as fallback base, handle empty list
first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
# Create table header with models as columns
header = "| Test | " + " | ".join(provider_models) + " |"
match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
if match:
example_base_test_name = match.group(1).strip()
example_case_id = match.group(2).strip()
else:
example_base_test_name = first_test_name
base_name = base_test_name_map.get(first_test_name, first_test_name) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
test_cmd_specific_case = (
f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
)
report.append(
f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
)
# Get display names (use passed-in providers dict)
provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
display_name_map = provider_config.get("model_display_names", {})
# Add Model Key Table (use provider_models)
report.append(f"\n**Model Key ({provider.capitalize()})**\n")
provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
for model_id in provider_models:
display_name = display_name_map.get(model_id, model_id)
provider_key_lines.append(f"| {display_name} | `{model_id}` |")
report.extend(provider_key_lines)
report.append("\n")
# Create results table header (use provider_models)
display_names = [display_name_map.get(m, m) for m in provider_models]
header = "| Test | " + " | ".join(display_names) + " |"
separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
report.append(header)
report.append(separator)
# Get results for this provider
provider_results = results_dict.get(provider, {})
# Get results for this provider from results_dict
provider_results_data = results_dict.get(provider, {})
# Add rows for each test
# Add rows for each test (use sorted_tests)
for test in sorted_tests:
row = f"| {test} |"
# Determine display name based on case count
base_name = base_test_name_map.get(test, test) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
display_test_name = base_name if case_count == 1 else test # Choose display name
row = f"| {display_test_name} |" # Use display name
# Add results for each model in this test
for model in provider_models:
if model in provider_results and test in provider_results[model]:
result = pass_icon if provider_results[model][test] else fail_icon
for model_id in provider_models:
if model_id in provider_results_data and test in provider_results_data[model_id]:
result = pass_icon if provider_results_data[model_id][test] else fail_icon
else:
result = na_icon
row += f" {result} |"
report.append(row)
# Write to file
@@ -442,9 +454,14 @@ def main():
help="Specify providers to test (comma-separated or space-separated, default: all)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
# Initialize collections to aggregate results in main
aggregated_providers = defaultdict(set)
aggregated_tests = set()
provider_timestamps = {}
if args.run_tests:
# Get list of available providers from command line or use detected providers
@@ -463,22 +480,31 @@ def main():
for provider in test_providers:
provider = provider.strip() # Remove any whitespace
result_file = run_tests(provider)
result_file = run_tests(provider, keyword=args.k)
if result_file:
provider_results = parse_results(result_file)
all_results.update(provider_results)
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
else:
# Use existing results
provider_result_files = get_latest_results_by_provider()
provider_result_files = get_all_result_files_by_provider()
for result_file in provider_result_files.values():
provider_results = parse_results(result_file)
all_results.update(provider_results)
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# Generate the report
generate_report(all_results, args.output)
cleanup_old_results()
generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":


@@ -1,97 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

import pytest
from openai import OpenAI


@pytest.fixture
def providers_model_mapping():
    """
    Mapping from model names used in test cases to provider's model names.
    """
    return {
        "fireworks": {
            "Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p1-70b-instruct",
            "Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
            "Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
            "Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
        },
        "together": {
            "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            "Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        },
        "groq": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
            "Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
            "Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
        },
        "cerebras": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b",
        },
        "openai": {
            "gpt-4o": "gpt-4o",
            "gpt-4o-mini": "gpt-4o-mini",
        },
    }


@pytest.fixture
def provider_metadata():
    return {
        "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
        "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
        "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
        "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
        "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
    }


@pytest.fixture
def provider(request, provider_metadata):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and provider_metadata[provider][0] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in provider_metadata.items():
            if metadata[0] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture
def base_url(request, provider, provider_metadata):
    return request.config.getoption("--base-url") or provider_metadata[provider][0]


@pytest.fixture
def api_key(request, provider, provider_metadata):
    return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )


@@ -1,202 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel

from tests.verifications.openai.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


@pytest.fixture
def correct_model_name(model, provider, providers_model_mapping):
    """Return the provider-specific model name based on the generic model name."""
    mapping = providers_model_mapping[provider]
    if model not in mapping:
        pytest.skip(f"Provider {provider} does not support model {model}")
    return mapping[model]


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = ""
    for chunk in response:
        maybe_json_content += chunk.choices[0].delta.content or ""
    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["model"],
)
@pytest.mark.parametrize(
"input_output",
chat_completion_test_cases["test_tool_calling"]["test_params"]["input_output"],
)
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
response = openai_client.chat.completions.create(
model=correct_model_name,
messages=input_output["input"]["messages"],
tools=input_output["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert input_output["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0


@ -0,0 +1,105 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from pathlib import Path
import pytest
import yaml
from openai import OpenAI
# --- Helper Function to Load Config ---
def _load_all_verification_configs():
"""Load and aggregate verification configs from the conf/ directory."""
# Note: Path is relative to *this* file (fixtures.py)
conf_dir = Path(__file__).parent.parent.parent / "conf"
if not conf_dir.is_dir():
# Raise directly here; the verification_config fixture below converts this
# into a pytest.fail at collection time, and direct callers can handle it.
raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")
all_provider_configs = {}
yaml_files = list(conf_dir.glob("*.yaml"))
if not yaml_files:
raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")
for config_path in yaml_files:
provider_name = config_path.stem
try:
with open(config_path, "r") as f:
provider_config = yaml.safe_load(f)
if provider_config:
all_provider_configs[provider_name] = provider_config
else:
# Warn and skip files that parse to an empty document.
print(f"Warning: Config file {config_path} is empty or invalid.")
except Exception as e:
raise IOError(f"Error loading config file {config_path}: {e}") from e
return {"providers": all_provider_configs}
# --- End Helper Function ---
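For reference, each YAML file under conf/ is expected to parse into a mapping with at least base_url and api_key_var plus a models list, since those are the keys the fixtures here and pytest_generate_tests in the test module read. A minimal sketch of what yaml.safe_load might return for a hypothetical conf/fireworks.yaml (all values illustrative):

sample_provider_config = {
    "base_url": "https://api.fireworks.ai/inference/v1",
    "api_key_var": "FIREWORKS_API_KEY",
    "models": [
        "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
        "accounts/fireworks/models/llama4-scout-instruct-basic",
    ],
    # An optional "test_exclusions" mapping may also be present; it is consumed
    # by should_skip_test in the test module further down this diff.
}
# _load_all_verification_configs() then surfaces this as
# {"providers": {"fireworks": sample_provider_config, ...}}.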
@pytest.fixture(scope="session")
def verification_config():
"""Pytest fixture to provide the loaded verification config."""
try:
return _load_all_verification_configs()
except (FileNotFoundError, IOError) as e:
pytest.fail(str(e)) # Fail test collection if config loading fails
@pytest.fixture
def provider(request, verification_config):
provider = request.config.getoption("--provider")
base_url = request.config.getoption("--base-url")
if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")
if not provider:
if not base_url:
raise ValueError("Provider and base URL are not provided")
for provider, metadata in verification_config["providers"].items():
if metadata["base_url"] == base_url:
provider = provider
break
return provider
@pytest.fixture
def base_url(request, provider, verification_config):
return request.config.getoption("--base-url") or verification_config["providers"][provider]["base_url"]
@pytest.fixture
def api_key(request, provider, verification_config):
provider_conf = verification_config.get("providers", {}).get(provider, {})
api_key_env_var = provider_conf.get("api_key_var")
key_from_option = request.config.getoption("--api-key")
key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None
final_key = key_from_option or key_from_env
return final_key
@pytest.fixture
def model_mapping(provider, providers_model_mapping):
return providers_model_mapping[provider]
@pytest.fixture
def openai_client(base_url, api_key):
return OpenAI(
base_url=base_url,
api_key=api_key,
)
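Net effect of these fixtures: a plain OpenAI client pointed at the selected provider's OpenAI-compatible endpoint, with the key pulled from the configured environment variable. A self-contained sketch of that end state, reusing the Groq endpoint, env var, and model id that appear in the provider tables earlier in this diff (purely illustrative):

import os

from openai import OpenAI

client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY"),
)
response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    stream=False,
)
print(response.choices[0].message.content)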


@ -1,31 +1,24 @@
test_chat_basic:
test_name: test_chat_basic
test_params:
input_output:
- input:
case:
- case_id: "earth"
input:
messages:
- content: Which planet do humans live on?
role: user
output: Earth
- input:
- case_id: "saturn"
input:
messages:
- content: Which planet has rings around it with a name starting with letter
S?
role: user
output: Saturn
model:
- Llama-3.3-8B-Instruct
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_chat_image:
test_name: test_chat_image
test_params:
input_output:
case:
- input:
messages:
- content:
@ -36,18 +29,12 @@ test_chat_image:
type: image_url
role: user
output: llama
model:
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_chat_structured_output:
test_name: test_chat_structured_output
test_params:
input_output:
- input:
case:
- case_id: "calendar"
input:
messages:
- content: Extract the event information.
role: system
@ -77,7 +64,8 @@ test_chat_structured_output:
type: object
type: json_schema
output: valid_calendar_event
- input:
- case_id: "math"
input:
messages:
- content: You are a helpful math tutor. Guide the user through the solution
step by step.
@ -118,19 +106,10 @@ test_chat_structured_output:
type: object
type: json_schema
output: valid_math_reasoning
model:
- Llama-3.3-8B-Instruct
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_tool_calling:
test_name: test_tool_calling
test_params:
input_output:
case:
- input:
messages:
- content: You are a helpful assistant that can use tools to get information.
@ -152,11 +131,3 @@ test_tool_calling:
type: object
type: function
output: get_weather_tool_call
model:
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini


@ -0,0 +1,326 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
from typing import Any
import pytest
from pydantic import BaseModel
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
from tests.verifications.openai_api.fixtures.load import load_test_cases
chat_completion_test_cases = load_test_cases("chat_completion")
def case_id_generator(case):
"""Generate a test ID from the case's 'case_id' field, or use a default."""
case_id = case.get("case_id")
if isinstance(case_id, (str, int)):
return re.sub(r"\W|^(?=\d)", "_", str(case_id))
return None
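For example, under Python 3.7+ zero-width match semantics a case_id of "earth" passes through untouched, while a hypothetical id such as "3.1 math" is sanitized to "_3_1_math":

assert case_id_generator({"case_id": "earth"}) == "earth"
assert case_id_generator({"case_id": "3.1 math"}) == "_3_1_math"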
def pytest_generate_tests(metafunc):
"""Dynamically parametrize tests based on the selected provider and config."""
if "model" in metafunc.fixturenames:
provider = metafunc.config.getoption("provider")
if not provider:
print("Warning: --provider not specified. Skipping model parametrization.")
metafunc.parametrize("model", [])
return
try:
config_data = _load_all_verification_configs()
except (FileNotFoundError, IOError) as e:
print(f"ERROR loading verification configs: {e}")
config_data = {"providers": {}}
provider_config = config_data.get("providers", {}).get(provider)
if provider_config:
models = provider_config.get("models", [])
if models:
metafunc.parametrize("model", models)
else:
print(f"Warning: No models found for provider '{provider}' in config.")
metafunc.parametrize("model", []) # Parametrize empty if no models found
else:
print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
metafunc.parametrize("model", []) # Parametrize empty if provider not found
def should_skip_test(verification_config, provider, model, test_name_base):
"""Check if a test should be skipped based on config exclusions."""
provider_config = verification_config.get("providers", {}).get(provider)
if not provider_config:
return False # No config for provider, don't skip
exclusions = provider_config.get("test_exclusions", {}).get(model, [])
return test_name_base in exclusions
# Helper to get the base test name from the request object
def get_base_test_name(request):
return request.node.originalname
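A concrete illustration of the exclusion lookup; the config fragment below is hypothetical, but the keys match what should_skip_test reads:

sample_config = {
    "providers": {
        "fireworks": {
            "test_exclusions": {
                "accounts/fireworks/models/llama-v3p2-11b-vision-instruct": [
                    "test_chat_non_streaming_image",
                ],
            },
        },
    },
}
model_id = "accounts/fireworks/models/llama-v3p2-11b-vision-instruct"
assert should_skip_test(sample_config, "fireworks", model_id, "test_chat_non_streaming_image")
assert not should_skip_test(sample_config, "fireworks", model_id, "test_chat_non_streaming_basic")
assert not should_skip_test(sample_config, "together", model_id, "test_chat_non_streaming_image")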
# --- Test Functions ---
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
maybe_json_content = response.choices[0].message.content
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=True,
)
maybe_json_content = ""
for chunk in response:
maybe_json_content += chunk.choices[0].delta.content or ""
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert case["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=True,
)
# Accumulate partial tool_calls here
tool_calls_buffer = {}
current_id = None
# Process streaming chunks
for chunk in stream:
choice = chunk.choices[0]
delta = choice.delta
if delta.tool_calls is None:
continue
for tool_call_delta in delta.tool_calls:
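# Stream deltas typically carry the tool call id only on the first fragment;
# reuse the last seen id so later argument fragments attach to the same call.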
if tool_call_delta.id:
current_id = tool_call_delta.id
call_id = current_id
func_delta = tool_call_delta.function
if call_id not in tool_calls_buffer:
tool_calls_buffer[call_id] = {
"id": call_id,
"type": tool_call_delta.type,
"name": func_delta.name,
"arguments": "",
}
if func_delta.arguments:
tool_calls_buffer[call_id]["arguments"] += func_delta.arguments
assert len(tool_calls_buffer) == 1
for call in tool_calls_buffer.values():
assert len(call["id"]) > 0
assert call["name"] == "get_weather"
args_dict = json.loads(call["arguments"])
assert "san francisco" in args_dict["location"].lower()
# --- Helper functions (structured output validation) ---
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0
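A minimal sketch of a model response that satisfies the valid_calendar_event checks above; the field values are made up, and the participants list deliberately has the two entries the assertion expects:

sample_event_json = '{"name": "Science Fair", "date": "Friday", "participants": ["Alice", "Bob"]}'
validate_structured_output(sample_event_json, "valid_calendar_event")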

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,945 @@
{
"created": 1744328898.0248861,
"duration": 47.561042070388794,
"exitcode": 0,
"root": "/Users/erichuang/projects/llama-stack",
"environment": {},
"summary": {
"passed": 24,
"total": 24,
"collected": 24
},
"collectors": [
{
"nodeid": "",
"outcome": "passed",
"result": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
"type": "Module"
}
]
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
"outcome": "passed",
"result": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]",
"type": "Function",
"lineno": 116
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 116
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]",
"type": "Function",
"lineno": 135
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 135
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"type": "Function",
"lineno": 204
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 204
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]",
"type": "Function",
"lineno": 228
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 228
}
]
}
],
"tests": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-earth]",
"parametrize",
"pytestmark",
"gpt-4o-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "earth"
},
"setup": {
"duration": 0.0694252080284059,
"outcome": "passed"
},
"call": {
"duration": 0.5709165419684723,
"outcome": "passed"
},
"teardown": {
"duration": 0.0007626248989254236,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "saturn"
},
"setup": {
"duration": 0.010281750001013279,
"outcome": "passed"
},
"call": {
"duration": 0.6309260830748826,
"outcome": "passed"
},
"teardown": {
"duration": 0.0001824579667299986,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"parametrize",
"pytestmark",
"gpt-4o-mini-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "earth"
},
"setup": {
"duration": 0.007922374992631376,
"outcome": "passed"
},
"call": {
"duration": 0.31756504194345325,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005268750246614218,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-mini-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "saturn"
},
"setup": {
"duration": 0.01643404201604426,
"outcome": "passed"
},
"call": {
"duration": 0.7479908330133185,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004037501057609916,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-earth]",
"parametrize",
"pytestmark",
"gpt-4o-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "earth"
},
"setup": {
"duration": 0.021671707974746823,
"outcome": "passed"
},
"call": {
"duration": 0.6701172919711098,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005569590721279383,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "saturn"
},
"setup": {
"duration": 0.015847125090658665,
"outcome": "passed"
},
"call": {
"duration": 0.636536999954842,
"outcome": "passed"
},
"teardown": {
"duration": 0.00029395800083875656,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-mini-earth]",
"parametrize",
"pytestmark",
"gpt-4o-mini-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "earth"
},
"setup": {
"duration": 0.011792832985520363,
"outcome": "passed"
},
"call": {
"duration": 0.5610962919890881,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003578749019652605,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-mini-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-mini-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "saturn"
},
"setup": {
"duration": 0.016500207944773138,
"outcome": "passed"
},
"call": {
"duration": 0.8060244580265135,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005296670133247972,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]",
"lineno": 116,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_image[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.008338792016729712,
"outcome": "passed"
},
"call": {
"duration": 7.009252917021513,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003042910248041153,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]",
"lineno": 116,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_image[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.007238540914840996,
"outcome": "passed"
},
"call": {
"duration": 3.134693874977529,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003104590578004718,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]",
"lineno": 135,
"outcome": "passed",
"keywords": [
"test_chat_streaming_image[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.0161851670127362,
"outcome": "passed"
},
"call": {
"duration": 3.0745719589758664,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022620800882577896,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]",
"lineno": 135,
"outcome": "passed",
"keywords": [
"test_chat_streaming_image[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.013220708002336323,
"outcome": "passed"
},
"call": {
"duration": 3.624867417034693,
"outcome": "passed"
},
"teardown": {
"duration": 0.00020633300300687551,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "calendar"
},
"setup": {
"duration": 0.017596833989955485,
"outcome": "passed"
},
"call": {
"duration": 1.248568250099197,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004248750628903508,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-math]",
"parametrize",
"pytestmark",
"gpt-4o-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "math"
},
"setup": {
"duration": 0.01512012502644211,
"outcome": "passed"
},
"call": {
"duration": 8.170285542029887,
"outcome": "passed"
},
"teardown": {
"duration": 0.00043537491001188755,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-mini-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "calendar"
},
"setup": {
"duration": 0.010376665974035859,
"outcome": "passed"
},
"call": {
"duration": 0.756480542011559,
"outcome": "passed"
},
"teardown": {
"duration": 0.00025695806834846735,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"parametrize",
"pytestmark",
"gpt-4o-mini-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "math"
},
"setup": {
"duration": 0.006846625008620322,
"outcome": "passed"
},
"call": {
"duration": 2.6833953330060467,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022558309137821198,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "calendar"
},
"setup": {
"duration": 0.009646040969528258,
"outcome": "passed"
},
"call": {
"duration": 0.6117532079806551,
"outcome": "passed"
},
"teardown": {
"duration": 0.00015258300118148327,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-math]",
"parametrize",
"pytestmark",
"gpt-4o-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "math"
},
"setup": {
"duration": 0.012024458032101393,
"outcome": "passed"
},
"call": {
"duration": 4.522625041077845,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004230838967487216,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-mini-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "calendar"
},
"setup": {
"duration": 0.009566582972183824,
"outcome": "passed"
},
"call": {
"duration": 2.5591942919418216,
"outcome": "passed"
},
"teardown": {
"duration": 0.0007555419579148293,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-mini-math]",
"parametrize",
"pytestmark",
"gpt-4o-mini-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "math"
},
"setup": {
"duration": 0.010828875005245209,
"outcome": "passed"
},
"call": {
"duration": 2.495122667052783,
"outcome": "passed"
},
"teardown": {
"duration": 0.0002802090020850301,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"lineno": 204,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.012762792059220374,
"outcome": "passed"
},
"call": {
"duration": 0.5655921660363674,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022304197773337364,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"lineno": 204,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.03188708401285112,
"outcome": "passed"
},
"call": {
"duration": 0.6159415419679135,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005549580091610551,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]",
"lineno": 228,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_calling[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.014768208027817309,
"outcome": "passed"
},
"call": {
"duration": 0.47373537498060614,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005811670562252402,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"lineno": 228,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.010271625011228025,
"outcome": "passed"
},
"call": {
"duration": 0.5656027499353513,
"outcome": "passed"
},
"teardown": {
"duration": 0.0025699170073494315,
"outcome": "passed"
}
}
],
"run_timestamp": 1744328848
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large