forked from phoenix-oss/llama-stack-mirror
chore: remove pytest reports (#2156)
# What does this PR do?

Cleanup old test code too.

Signed-off-by: Sébastien Han <seb@redhat.com>
This commit is contained in: parent 8e316c9b1e, commit 26dffff92a
13 changed files with 0 additions and 1032 deletions
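For context, the removed report flow hung off two pytest options defined in the deleted provider-test conftest.py shown below: `--config` selected a YAML test plan and `--output` wrote a markdown report. A rough, illustrative invocation reconstructed from the removed help strings (the exact file names are assumptions, not something this PR documents) would have looked like:

    pytest llama_stack/providers/tests/ --config=test_config.yml --output=pytest_report.md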
@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -1,55 +0,0 @@
inference:
  tests:
  - inference/test_vision_inference.py::test_vision_chat_completion_streaming
  - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
  - inference/test_text_inference.py::test_structured_output
  - inference/test_text_inference.py::test_chat_completion_streaming
  - inference/test_text_inference.py::test_chat_completion_non_streaming
  - inference/test_text_inference.py::test_chat_completion_with_tool_calling
  - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming

  scenarios:
  - provider_fixtures:
      inference: ollama
  - fixture_combo_id: fireworks
  - provider_fixtures:
      inference: together
  # - inference: tgi
  # - inference: vllm_remote

  inference_models:
  - meta-llama/Llama-3.1-8B-Instruct
  - meta-llama/Llama-3.2-11B-Vision-Instruct


agents:
  tests:
  - agents/test_agents.py::test_agent_turns_with_safety
  - agents/test_agents.py::test_rag_agent

  scenarios:
  - fixture_combo_id: ollama
  - fixture_combo_id: together
  - fixture_combo_id: fireworks

  inference_models:
  - meta-llama/Llama-3.2-1B-Instruct

  safety_shield: meta-llama/Llama-Guard-3-1B


memory:
  tests:
  - memory/test_memory.py::test_query_documents

  scenarios:
  - fixture_combo_id: ollama
  - provider_fixtures:
      inference: sentence_transformers
      memory: faiss
  - fixture_combo_id: chroma

  inference_models:
  - meta-llama/Llama-3.2-1B-Instruct

  embedding_model: all-MiniLM-L6-v2
@@ -1,296 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from collections import defaultdict
from pathlib import Path
from typing import Any

import pytest
import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from termcolor import colored

from llama_stack.distribution.datatypes import Provider
from llama_stack.providers.datatypes import RemoteProviderConfig

from .env import get_env_or_fail
from .report import Report


class ProviderFixture(BaseModel):
    providers: list[Provider]
    provider_data: dict[str, Any] | None = None


class TestScenario(BaseModel):
    # provider fixtures can be either a mark or a dictionary of api -> providers
    provider_fixtures: dict[str, str] = Field(default_factory=dict)
    fixture_combo_id: str | None = None


class APITestConfig(BaseModel):
    scenarios: list[TestScenario] = Field(default_factory=list)
    inference_models: list[str] = Field(default_factory=list)

    # test name format should be <relative_path.py>::<test_name>
    tests: list[str] = Field(default_factory=list)


class MemoryApiTestConfig(APITestConfig):
    embedding_model: str | None = Field(default_factory=None)


class AgentsApiTestConfig(APITestConfig):
    safety_shield: str | None = Field(default_factory=None)


class TestConfig(BaseModel):
    inference: APITestConfig | None = None
    agents: AgentsApiTestConfig | None = None
    memory: MemoryApiTestConfig | None = None


def get_test_config_from_config_file(metafunc_config):
    config_file = metafunc_config.getoption("--config")
    if config_file is None:
        return None

    config_file_path = Path(__file__).parent / config_file
    if not config_file_path.exists():
        raise ValueError(
            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
        )
    with open(config_file_path) as config_file:
        config = yaml.safe_load(config_file)
        return TestConfig(**config)


def get_test_config_for_api(metafunc_config, api):
    test_config = get_test_config_from_config_file(metafunc_config)
    if test_config is None:
        return None
    return getattr(test_config, api)


def get_provider_fixture_overrides_from_test_config(metafunc_config, api, default_provider_fixture_combinations):
    api_config = get_test_config_for_api(metafunc_config, api)
    if api_config is None:
        return None

    fixture_combo_ids = set()
    custom_provider_fixture_combos = []
    for scenario in api_config.scenarios:
        if scenario.fixture_combo_id:
            fixture_combo_ids.add(scenario.fixture_combo_id)
        else:
            custom_provider_fixture_combos.append(
                pytest.param(
                    scenario.provider_fixtures,
                    id=scenario.provider_fixtures.get("inference") or "",
                )
            )

    if len(fixture_combo_ids) > 0:
        for default_fixture in default_provider_fixture_combinations:
            if default_fixture.id in fixture_combo_ids:
                custom_provider_fixture_combos.append(default_fixture)
    return custom_provider_fixture_combos


def remote_stack_fixture() -> ProviderFixture:
    if url := os.getenv("REMOTE_STACK_URL", None):
        config = RemoteProviderConfig.from_url(url)
    else:
        config = RemoteProviderConfig(
            host=get_env_or_fail("REMOTE_STACK_HOST"),
            port=int(get_env_or_fail("REMOTE_STACK_PORT")),
        )
    return ProviderFixture(
        providers=[
            Provider(
                provider_id="test::remote",
                provider_type="test::remote",
                config=config.model_dump(),
            )
        ],
    )


def pytest_configure(config):
    config.option.tbstyle = "short"
    config.option.disable_warnings = True

    """Load environment variables at start of test run"""
    # Load from .env file if it exists
    env_file = Path(__file__).parent / ".env"
    if env_file.exists():
        load_dotenv(env_file)

    # Load any environment variables passed via --env
    env_vars = config.getoption("--env") or []
    for env_var in env_vars:
        key, value = env_var.split("=", 1)
        os.environ[key] = value

    if config.getoption("--output") is not None:
        config.pluginmanager.register(Report(config.getoption("--output")))


def pytest_addoption(parser):
    parser.addoption(
        "--providers",
        default="",
        help=(
            "Provider configuration in format: api1=provider1,api2=provider2. "
            "Example: --providers inference=ollama,safety=meta-reference"
        ),
    )
    parser.addoption(
        "--config",
        action="store",
        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
    )
    parser.addoption(
        "--output",
        action="store",
        help="Set output file for test report, e.g. --output=pytest_report.md",
    )
    """Add custom command line options"""
    parser.addoption("--env", action="append", help="Set environment variables, e.g. --env KEY=value")
    parser.addoption(
        "--inference-model",
        action="store",
        default="meta-llama/Llama-3.2-3B-Instruct",
        help="Specify the inference model to use for testing",
    )
    parser.addoption(
        "--safety-shield",
        action="store",
        default="meta-llama/Llama-Guard-3-1B",
        help="Specify the safety shield to use for testing",
    )
    parser.addoption(
        "--embedding-model",
        action="store",
        default=None,
        help="Specify the embedding model to use for testing",
    )
    parser.addoption(
        "--judge-model",
        action="store",
        default="meta-llama/Llama-3.1-8B-Instruct",
        help="Specify the judge model to use for testing",
    )


def make_provider_id(providers: dict[str, str]) -> str:
    return ":".join(f"{api}={provider}" for api, provider in sorted(providers.items()))


def get_provider_marks(providers: dict[str, str]) -> list[Any]:
    marks = []
    for provider in providers.values():
        marks.append(getattr(pytest.mark, provider))
    return marks


def get_provider_fixture_overrides(config, available_fixtures: dict[str, list[str]]) -> list[pytest.param] | None:
    provider_str = config.getoption("--providers")
    if not provider_str:
        return None

    fixture_dict = parse_fixture_string(provider_str, available_fixtures)
    return [
        pytest.param(
            fixture_dict,
            id=make_provider_id(fixture_dict),
            marks=get_provider_marks(fixture_dict),
        )
    ]


def parse_fixture_string(provider_str: str, available_fixtures: dict[str, list[str]]) -> dict[str, str]:
    """Parse provider string of format 'api1=provider1,api2=provider2'"""
    if not provider_str:
        return {}

    fixtures = {}
    pairs = provider_str.split(",")
    for pair in pairs:
        if "=" not in pair:
            raise ValueError(f"Invalid provider specification: {pair}. Expected format: api=provider")
        api, fixture = pair.split("=")
        if api not in available_fixtures:
            raise ValueError(f"Unknown API: {api}. Available APIs: {list(available_fixtures.keys())}")
        if fixture not in available_fixtures[api]:
            raise ValueError(
                f"Unknown provider '{fixture}' for API '{api}'. Available providers: {list(available_fixtures[api])}"
            )
        fixtures[api] = fixture

    # Check that all provided APIs are supported
    for api in available_fixtures.keys():
        if api not in fixtures:
            raise ValueError(
                f"Missing provider fixture for API '{api}'. Available providers: {list(available_fixtures[api])}"
            )
    return fixtures


def pytest_itemcollected(item):
    # Get all markers as a list
    filtered = ("asyncio", "parametrize")
    marks = [mark.name for mark in item.iter_markers() if mark.name not in filtered]
    if marks:
        marks = colored(",".join(marks), "yellow")
        item.name = f"{item.name}[{marks}]"


def pytest_collection_modifyitems(session, config, items):
    test_config = get_test_config_from_config_file(config)
    if test_config is None:
        return

    required_tests = defaultdict(set)
    for api_test_config in [
        test_config.inference,
        test_config.memory,
        test_config.agents,
    ]:
        if api_test_config is None:
            continue
        for test in api_test_config.tests:
            arr = test.split("::")
            if len(arr) != 2:
                raise ValueError(f"Invalid format for test name {test}")
            test_path, func_name = arr
            required_tests[Path(__file__).parent / test_path].add(func_name)

    new_items, deselected_items = [], []
    for item in items:
        func_name = getattr(item, "originalname", item.name)
        if func_name in required_tests[item.fspath]:
            new_items.append(item)
            continue
        deselected_items.append(item)

    items[:] = new_items
    config.hook.pytest_deselected(items=deselected_items)


pytest_plugins = [
    "llama_stack.providers.tests.inference.fixtures",
    "llama_stack.providers.tests.safety.fixtures",
    "llama_stack.providers.tests.vector_io.fixtures",
    "llama_stack.providers.tests.agents.fixtures",
    "llama_stack.providers.tests.datasetio.fixtures",
    "llama_stack.providers.tests.scoring.fixtures",
    "llama_stack.providers.tests.eval.fixtures",
    "llama_stack.providers.tests.post_training.fixtures",
    "llama_stack.providers.tests.tools.fixtures",
]
@@ -1,176 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from collections import defaultdict
from pathlib import Path

import pytest
from pytest import ExitCode
from pytest_html.basereport import _process_outcome

from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.models.llama.sku_types import CoreModelId

INFERENCE_APIS = ["chat_completion"]
FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
SUPPORTED_MODELS = {
    "ollama": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_1b.value,
    },
    "fireworks": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_11b_vision.value,
    },
    "together": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_11b_vision.value,
    },
}


class Report:
    def __init__(self, output_path):
        valid_file_format = (
            output_path.split(".")[1] in ["md", "markdown"] if len(output_path.split(".")) == 2 else False
        )
        if not valid_file_format:
            raise ValueError(f"Invalid output file {output_path}. Markdown file is required")
        self.output_path = output_path
        self.test_data = defaultdict(dict)
        self.inference_tests = defaultdict(dict)

    @pytest.hookimpl
    def pytest_runtest_logreport(self, report):
        # This hook is called in several phases, including setup, call and teardown
        # The test is considered failed / error if any of the outcomes is not "Passed"
        outcome = _process_outcome(report)
        data = {
            "outcome": report.outcome,
            "longrepr": report.longrepr,
            "name": report.nodeid,
        }
        if report.nodeid not in self.test_data:
            self.test_data[report.nodeid] = data
        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
            self.test_data[report.nodeid] = data

    @pytest.hookimpl
    def pytest_sessionfinish(self, session, exitstatus):
        if exitstatus <= ExitCode.INTERRUPTED:
            return
        report = []
        report.append("# Llama Stack Integration Test Results Report")
        report.append("\n## Summary")
        report.append("\n## Supported Models: ")

        header = "| Model Descriptor |"
        dividor = "|:---|"
        for k in SUPPORTED_MODELS.keys():
            header += f"{k} |"
            dividor += ":---:|"

        report.append(header)
        report.append(dividor)

        rows = []
        for model in all_registered_models():
            if "Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value:
                continue
            row = f"| {model.core_model_id.value} |"
            for k in SUPPORTED_MODELS.keys():
                if model.core_model_id.value in SUPPORTED_MODELS[k]:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
            rows.append(row)
        report.extend(rows)

        report.append("\n### Tests:")

        for provider in SUPPORTED_MODELS.keys():
            if provider not in self.inference_tests:
                continue
            report.append(f"\n #### {provider}")
            test_table = [
                "| Area | Model | API | Functionality Test | Status |",
                "|:-----|:-----|:-----|:-----|:-----|",
            ]
            for api in INFERENCE_APIS:
                tests = self.inference_tests[provider][api]
                for test_nodeid in tests:
                    row = "|{area} | {model} | {api} | {test} | {result} ".format(
                        area="Text" if "text" in test_nodeid else "Vision",
                        model=("Llama-3.1-8B-Instruct" if "text" in test_nodeid else "Llama3.2-11B-Vision-Instruct"),
                        api=f"/{api}",
                        test=self.get_simple_function_name(test_nodeid),
                        result=("✅" if self.test_data[test_nodeid]["outcome"] == "passed" else "❌"),
                    )
                    test_table += [row]
            report.extend(test_table)
            report.append("\n")

        output_file = Path(self.output_path)
        output_file.write_text("\n".join(report))
        print(f"\n Report generated: {output_file.absolute()}")

    @pytest.hookimpl(trylast=True)
    def pytest_collection_modifyitems(self, session, config, items):
        for item in items:
            inference = item.callspec.params.get("inference_stack")
            if "inference" in item.nodeid:
                func_name = getattr(item, "originalname", item.name)
                for api in INFERENCE_APIS:
                    if api in func_name:
                        api_tests = self.inference_tests[inference].get(api, set())
                        api_tests.add(item.nodeid)
                        self.inference_tests[inference][api] = api_tests

    def get_simple_function_name(self, nodeid):
        """Extract function name from nodeid.

        Examples:
        - 'tests/test_math.py::test_addition' -> 'test_addition'
        - 'tests/test_math.py::TestClass::test_method' -> test_method'
        """
        parts = nodeid.split("::")
        func_name = nodeid  # Fallback to full nodeid if pattern doesn't match
        if len(parts) == 2:  # Simple function
            func_name = parts[1]
        elif len(parts) == 3:  # Class method
            func_name = parts[2]
        return func_name.split("[")[0]
@@ -1,43 +0,0 @@
# Report for cerebras distribution

## Supported Models
| Model Descriptor | cerebras |
|:---|:---|
| meta-llama/Llama-3-8B-Instruct | ❌ |
| meta-llama/Llama-3-70B-Instruct | ❌ |
| meta-llama/Llama-3.1-8B-Instruct | ✅ |
| meta-llama/Llama-3.1-70B-Instruct | ❌ |
| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
| meta-llama/Llama-3.2-1B-Instruct | ❌ |
| meta-llama/Llama-3.2-3B-Instruct | ❌ |
| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
| meta-llama/Llama-3.3-70B-Instruct | ✅ |
| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
| meta-llama/Llama-Guard-3-1B | ❌ |
| meta-llama/Llama-Guard-3-8B | ❌ |
| meta-llama/Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ❌ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ❌ |
@@ -1,45 +0,0 @@
# Report for fireworks distribution

## Supported Models
| Model Descriptor | fireworks |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ❌ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -1,43 +0,0 @@
# Report for ollama distribution

## Supported Models
| Model Descriptor | ollama |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ❌ |
| Llama-Guard-3-1B | ✅ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -1,44 +0,0 @@
# Report for tgi distribution

## Supported Models
| Model Descriptor | tgi |
|:---|:---|
| Llama-3-8B-Instruct | ✅ |
| Llama-3-70B-Instruct | ✅ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ✅ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ✅ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
@@ -1,45 +0,0 @@
# Report for together distribution

## Supported Models
| Model Descriptor | together |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ❌ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ❌ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -28,7 +28,6 @@ if no model is specified.
 
 Experimental, under development, options:
 - `--record-responses`: record new API responses instead of using cached ones
-- `--report`: path where the test report should be written, e.g. --report=/path/to/report.md
 
 
 ## Examples
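The `--report` flag removed above was consumed by the integration-test Report plugin deleted at the bottom of this commit, which also read `--stack-config` to derive the distribution name used in the report title. An illustrative invocation reconstructed from those removed options (the `tests/integration` path is an assumption; this PR does not document a command) would have been:

    pytest tests/integration --stack-config=fireworks --report=/path/to/report.md

The output was a markdown file in the same shape as the per-distribution reports deleted earlier in this diff.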
@@ -15,8 +15,6 @@ from dotenv import load_dotenv
 
 from llama_stack.log import get_logger
 
-from .report import Report
-
 logger = get_logger(__name__, category="tests")
 
 
@@ -60,9 +58,6 @@ def pytest_configure(config):
         os.environ["DISABLE_CODE_SANDBOX"] = "1"
         logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
 
-    if config.getoption("--report"):
-        config.pluginmanager.register(Report(config))
-
 
 def pytest_addoption(parser):
     parser.addoption(
@@ -1,54 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api

INFERENCE_API_CAPA_TEST_MAP = {
    "chat_completion": {
        "streaming": [
            "test_text_chat_completion_streaming",
            "test_image_chat_completion_streaming",
        ],
        "non_streaming": [
            "test_image_chat_completion_non_streaming",
            "test_text_chat_completion_non_streaming",
        ],
        "tool_calling": [
            "test_text_chat_completion_with_tool_calling_and_streaming",
            "test_text_chat_completion_with_tool_calling_and_non_streaming",
        ],
        "log_probs": [
            "test_completion_log_probs_non_streaming",
            "test_completion_log_probs_streaming",
        ],
    },
    "completion": {
        "streaming": ["test_text_completion_streaming"],
        "non_streaming": ["test_text_completion_non_streaming"],
        "structured_output": ["test_text_completion_structured_output"],
    },
}

VECTORIO_API_TEST_MAP = {
    "retrieve": {
        "": ["test_vector_db_retrieve"],
    }
}

AGENTS_API_TEST_MAP = {
    "create_agent_turn": {
        "rag": ["test_rag_agent"],
        "custom_tool": ["test_custom_tool"],
        "code_execution": ["test_code_interpreter_for_attachments"],
    }
}


API_MAPS = {
    Api.inference: INFERENCE_API_CAPA_TEST_MAP,
    Api.vector_io: VECTORIO_API_TEST_MAP,
    Api.agents: AGENTS_API_TEST_MAP,
}
@@ -1,220 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from collections import defaultdict
from pathlib import Path

import pytest
from pytest import CollectReport
from termcolor import cprint

from llama_stack.models.llama.sku_list import (
    all_registered_models,
    llama3_1_instruct_models,
    llama3_2_instruct_models,
    llama3_3_instruct_models,
    llama3_instruct_models,
    safety_models,
)
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import Api

from .metadata import API_MAPS


def featured_models():
    models = [
        *llama3_instruct_models(),
        *llama3_1_instruct_models(),
        *llama3_2_instruct_models(),
        *llama3_3_instruct_models(),
        *safety_models(),
    ]
    return {model.huggingface_repo: model for model in models if not model.variant}


SUPPORTED_MODELS = {
    "ollama": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_1b.value,
    },
    "tgi": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
    "vllm": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
}


class Report:
    def __init__(self, config):
        self.distro_name = None
        self.config = config
        self.output_path = Path(config.getoption("--report")) if config.getoption("--report") else None

        stack_config = self.config.getoption("--stack-config")
        if stack_config:
            is_url = stack_config.startswith("http") or "//" in stack_config
            is_yaml = stack_config.endswith(".yaml")
            if not is_url and not is_yaml:
                self.distro_name = stack_config

        self.report_data = defaultdict(dict)
        # test function -> test nodeid
        self.test_data = dict()
        self.test_name_to_nodeid = defaultdict(list)
        self.vision_model_id = None
        self.text_model_id = None
        self.client = None

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_logreport(self, report):
        # This hook is called in several phases, including setup, call and teardown
        # The test is considered failed / error if any of the outcomes is not "Passed"
        outcome = self._process_outcome(report)
        if report.nodeid not in self.test_data:
            self.test_data[report.nodeid] = outcome
        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
            self.test_data[report.nodeid] = outcome

    def pytest_sessionfinish(self, session):
        if not self.client:
            return

        report = []
        report.append(f"# Report for {self.distro_name} distribution")
        report.append("\n## Supported Models")

        header = f"| Model Descriptor | {self.distro_name} |"
        dividor = "|:---|:---|"

        report.append(header)
        report.append(dividor)

        rows = []
        if self.distro_name in SUPPORTED_MODELS:
            for model in all_registered_models():
                if ("Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value) or (
                    model.variant
                ):
                    continue
                row = f"| {model.core_model_id.value} |"
                if model.core_model_id.value in SUPPORTED_MODELS[self.distro_name]:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        else:
            supported_models = {m.identifier for m in self.client.models.list()}
            for hf_name, model in featured_models().items():
                row = f"| {model.core_model_id.value} |"
                if hf_name in supported_models:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        report.extend(rows)

        report.append("\n## Inference")
        test_table = [
            "| Model | API | Capability | Test | Status |",
            "|:----- |:-----|:-----|:-----|:-----|",
        ]
        for api, capa_map in API_MAPS[Api.inference].items():
            for capa, tests in capa_map.items():
                for test_name in tests:
                    model_id = self.text_model_id if "text" in test_name else self.vision_model_id
                    test_nodeids = self.test_name_to_nodeid[test_name]
                    if not test_nodeids:
                        continue

                    # There might be more than one parametrizations for the same test function. We take
                    # the result of the first one for now. Ideally we should mark the test as failed if
                    # any of the parametrizations failed.
                    test_table.append(
                        f"| {model_id} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                    )

        report.extend(test_table)

        name_map = {Api.vector_io: "Vector IO", Api.agents: "Agents"}
        providers = self.client.providers.list()
        for api_group in [Api.vector_io, Api.agents]:
            api_capitalized = name_map[api_group]
            report.append(f"\n## {api_capitalized}")
            test_table = [
                "| Provider | API | Capability | Test | Status |",
                "|:-----|:-----|:-----|:-----|:-----|",
            ]
            provider = [p for p in providers if p.api == str(api_group.name)]
            provider_str = ",".join(str(p) for p in provider) if provider else ""
            for api, capa_map in API_MAPS[api_group].items():
                for capa, tests in capa_map.items():
                    for test_name in tests:
                        test_nodeids = self.test_name_to_nodeid[test_name]
                        if not test_nodeids:
                            continue
                        test_table.append(
                            f"| {provider_str} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                        )
            report.extend(test_table)

        output_file = self.output_path
        text = "\n".join(report) + "\n"
        output_file.write_text(text)
        cprint(f"\nReport generated: {output_file.absolute()}", "green")

    def pytest_runtest_makereport(self, item, call):
        func_name = getattr(item, "originalname", item.name)
        self.test_name_to_nodeid[func_name].append(item.nodeid)

        # Get values from fixtures for report output
        if model_id := item.funcargs.get("text_model_id"):
            parts = model_id.split("/")
            text_model = parts[1] if len(parts) > 1 else model_id
            self.text_model_id = self.text_model_id or text_model
        elif model_id := item.funcargs.get("vision_model_id"):
            parts = model_id.split("/")
            vision_model = parts[1] if len(parts) > 1 else model_id
            self.vision_model_id = self.vision_model_id or vision_model

        if not self.client:
            self.client = item.funcargs.get("llama_stack_client")

    def _print_result_icon(self, result):
        if result == "Passed":
            return "✅"
        elif result == "Failed" or result == "Error":
            return "❌"
        else:
            # result == "Skipped":
            return "⏭️"

    def _process_outcome(self, report: CollectReport):
        if self._is_error(report):
            return "Error"
        if hasattr(report, "wasxfail"):
            if report.outcome in ["passed", "failed"]:
                return "XPassed"
            if report.outcome == "skipped":
                return "XFailed"

        return report.outcome.capitalize()

    def _is_error(self, report: CollectReport):
        return report.when in ["setup", "teardown", "collect"] and report.outcome == "failed"