From 26dffff92a5bb09df1be620ae1216ac7089615fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 14 May 2025 07:40:15 +0200
Subject: [PATCH] chore: remove pytest reports (#2156)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Remove the custom pytest report plugins and the checked-in report files they generated; clean up old test code too.

Signed-off-by: Sébastien Han
---
 llama_stack/providers/tests/__init__.py      |   5 -
 .../providers/tests/ci_test_config.yaml      |  55 ----
 llama_stack/providers/tests/conftest.py      | 296 ------------------
 llama_stack/providers/tests/report.py        | 176 -----------
 llama_stack/templates/cerebras/report.md     |  43 ---
 llama_stack/templates/fireworks/report.md    |  45 ---
 llama_stack/templates/ollama/report.md       |  43 ---
 llama_stack/templates/tgi/report.md          |  44 ---
 llama_stack/templates/together/report.md     |  45 ---
 tests/integration/README.md                  |   1 -
 tests/integration/conftest.py                |   5 -
 tests/integration/metadata.py                |  54 ----
 tests/integration/report.py                  | 220 -------------
 13 files changed, 1032 deletions(-)
 delete mode 100644 llama_stack/providers/tests/__init__.py
 delete mode 100644 llama_stack/providers/tests/ci_test_config.yaml
 delete mode 100644 llama_stack/providers/tests/conftest.py
 delete mode 100644 llama_stack/providers/tests/report.py
 delete mode 100644 llama_stack/templates/cerebras/report.md
 delete mode 100644 llama_stack/templates/fireworks/report.md
 delete mode 100644 llama_stack/templates/ollama/report.md
 delete mode 100644 llama_stack/templates/tgi/report.md
 delete mode 100644 llama_stack/templates/together/report.md
 delete mode 100644 tests/integration/metadata.py
 delete mode 100644 tests/integration/report.py

diff --git a/llama_stack/providers/tests/__init__.py b/llama_stack/providers/tests/__init__.py
deleted file mode 100644
index 756f351d8..000000000
--- a/llama_stack/providers/tests/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/llama_stack/providers/tests/ci_test_config.yaml b/llama_stack/providers/tests/ci_test_config.yaml
deleted file mode 100644
index 3edcd38bf..000000000
--- a/llama_stack/providers/tests/ci_test_config.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-inference:
-  tests:
-  - inference/test_vision_inference.py::test_vision_chat_completion_streaming
-  - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_structured_output
-  - inference/test_text_inference.py::test_chat_completion_streaming
-  - inference/test_text_inference.py::test_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming
-
-  scenarios:
-  - provider_fixtures:
-      inference: ollama
-  - fixture_combo_id: fireworks
-  - provider_fixtures:
-      inference: together
-  # - inference: tgi
-  # - inference: vllm_remote
-
-  inference_models:
-  - meta-llama/Llama-3.1-8B-Instruct
-  - meta-llama/Llama-3.2-11B-Vision-Instruct
-
-
-agents:
-  tests:
-  - agents/test_agents.py::test_agent_turns_with_safety
-  - agents/test_agents.py::test_rag_agent
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - fixture_combo_id: together
-  - fixture_combo_id: fireworks
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  safety_shield: meta-llama/Llama-Guard-3-1B
-
-
-memory:
-  tests:
-  - memory/test_memory.py::test_query_documents
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - provider_fixtures:
-      inference: sentence_transformers
-      memory: faiss
-  - fixture_combo_id: chroma
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  embedding_model: all-MiniLM-L6-v2
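
For context: the YAML above was consumed by the pydantic models in the conftest.py deleted next. A minimal standalone sketch of that loading path, using trimmed copies of the models for illustration (pydantic v2 assumed; the full definitions appear in the diff below):

```python
# Sketch: how ci_test_config.yaml mapped onto the conftest.py models below.
# These are trimmed copies for illustration, not the full originals.
import yaml
from pydantic import BaseModel, Field


class TestScenario(BaseModel):
    provider_fixtures: dict[str, str] = Field(default_factory=dict)
    fixture_combo_id: str | None = None


class APITestConfig(BaseModel):
    scenarios: list[TestScenario] = Field(default_factory=list)
    inference_models: list[str] = Field(default_factory=list)
    tests: list[str] = Field(default_factory=list)


class TestConfig(BaseModel):
    inference: APITestConfig | None = None  # agents/memory omitted in this sketch


with open("ci_test_config.yaml") as f:
    config = TestConfig(**yaml.safe_load(f))

print(config.inference.tests[0])
# inference/test_vision_inference.py::test_vision_chat_completion_streaming
```
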
diff --git a/llama_stack/providers/tests/conftest.py b/llama_stack/providers/tests/conftest.py
deleted file mode 100644
index cd86af0d6..000000000
--- a/llama_stack/providers/tests/conftest.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-
-import pytest
-import yaml
-from dotenv import load_dotenv
-from pydantic import BaseModel, Field
-from termcolor import colored
-
-from llama_stack.distribution.datatypes import Provider
-from llama_stack.providers.datatypes import RemoteProviderConfig
-
-from .env import get_env_or_fail
-from .report import Report
-
-
-class ProviderFixture(BaseModel):
-    providers: list[Provider]
-    provider_data: dict[str, Any] | None = None
-
-
-class TestScenario(BaseModel):
-    # provider fixtures can be either a mark or a dictionary of api -> providers
-    provider_fixtures: dict[str, str] = Field(default_factory=dict)
-    fixture_combo_id: str | None = None
-
-
-class APITestConfig(BaseModel):
-    scenarios: list[TestScenario] = Field(default_factory=list)
-    inference_models: list[str] = Field(default_factory=list)
-
-    # test name format should be <relative_path_to_file>::<test_name>
-    tests: list[str] = Field(default_factory=list)
-
-
-class MemoryApiTestConfig(APITestConfig):
-    embedding_model: str | None = Field(default_factory=None)
-
-
-class AgentsApiTestConfig(APITestConfig):
-    safety_shield: str | None = Field(default_factory=None)
-
-
-class TestConfig(BaseModel):
-    inference: APITestConfig | None = None
-    agents: AgentsApiTestConfig | None = None
-    memory: MemoryApiTestConfig | None = None
-
-
-def get_test_config_from_config_file(metafunc_config):
-    config_file = metafunc_config.getoption("--config")
-    if config_file is None:
-        return None
-
-    config_file_path = Path(__file__).parent / config_file
-    if not config_file_path.exists():
-        raise ValueError(
-            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
-        )
-    with open(config_file_path) as config_file:
-        config = yaml.safe_load(config_file)
-        return TestConfig(**config)
-
-
-def get_test_config_for_api(metafunc_config, api):
-    test_config = get_test_config_from_config_file(metafunc_config)
-    if test_config is None:
-        return None
-    return getattr(test_config, api)
-
-
-def get_provider_fixture_overrides_from_test_config(metafunc_config, api, default_provider_fixture_combinations):
-    api_config = get_test_config_for_api(metafunc_config, api)
-    if api_config is None:
-        return None
-
-    fixture_combo_ids = set()
-    custom_provider_fixture_combos = []
-    for scenario in api_config.scenarios:
-        if scenario.fixture_combo_id:
-            fixture_combo_ids.add(scenario.fixture_combo_id)
-        else:
-            custom_provider_fixture_combos.append(
-                pytest.param(
-                    scenario.provider_fixtures,
-                    id=scenario.provider_fixtures.get("inference") or "",
-                )
-            )
-
-    if len(fixture_combo_ids) > 0:
-        for default_fixture in default_provider_fixture_combinations:
-            if default_fixture.id in fixture_combo_ids:
-                custom_provider_fixture_combos.append(default_fixture)
-    return custom_provider_fixture_combos
-
-
-def remote_stack_fixture() -> ProviderFixture:
-    if url := os.getenv("REMOTE_STACK_URL", None):
-        config = RemoteProviderConfig.from_url(url)
-    else:
-        config = RemoteProviderConfig(
-            host=get_env_or_fail("REMOTE_STACK_HOST"),
-            port=int(get_env_or_fail("REMOTE_STACK_PORT")),
-        )
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="test::remote",
-                provider_type="test::remote",
-                config=config.model_dump(),
-            )
-        ],
-    )
-
-
-def pytest_configure(config):
-    config.option.tbstyle = "short"
-    config.option.disable_warnings = True
-
-    """Load environment variables at start of test run"""
-    # Load from .env file if it exists
-    env_file = Path(__file__).parent / ".env"
-    if env_file.exists():
-        load_dotenv(env_file)
-
-    # Load any environment variables passed via --env
-    env_vars = config.getoption("--env") or []
-    for env_var in env_vars:
-        key, value = env_var.split("=", 1)
-        os.environ[key] = value
-
-    if config.getoption("--output") is not None:
-        config.pluginmanager.register(Report(config.getoption("--output")))
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--providers",
-        default="",
-        help=(
-            "Provider configuration in format: api1=provider1,api2=provider2. "
-            "Example: --providers inference=ollama,safety=meta-reference"
-        ),
-    )
-    parser.addoption(
-        "--config",
-        action="store",
-        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
-    )
-    parser.addoption(
-        "--output",
-        action="store",
-        help="Set output file for test report, e.g. --output=pytest_report.md",
-    )
-    """Add custom command line options"""
-    parser.addoption("--env", action="append", help="Set environment variables, e.g. --env KEY=value")
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default="meta-llama/Llama-Guard-3-1B",
-        help="Specify the safety shield to use for testing",
-    )
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default=None,
-        help="Specify the embedding model to use for testing",
-    )
-    parser.addoption(
-        "--judge-model",
-        action="store",
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Specify the judge model to use for testing",
-    )
-
-
-def make_provider_id(providers: dict[str, str]) -> str:
-    return ":".join(f"{api}={provider}" for api, provider in sorted(providers.items()))
-
-
-def get_provider_marks(providers: dict[str, str]) -> list[Any]:
-    marks = []
-    for provider in providers.values():
-        marks.append(getattr(pytest.mark, provider))
-    return marks
-
-
-def get_provider_fixture_overrides(config, available_fixtures: dict[str, list[str]]) -> list[pytest.param] | None:
-    provider_str = config.getoption("--providers")
-    if not provider_str:
-        return None
-
-    fixture_dict = parse_fixture_string(provider_str, available_fixtures)
-    return [
-        pytest.param(
-            fixture_dict,
-            id=make_provider_id(fixture_dict),
-            marks=get_provider_marks(fixture_dict),
-        )
-    ]
-
-
-def parse_fixture_string(provider_str: str, available_fixtures: dict[str, list[str]]) -> dict[str, str]:
-    """Parse provider string of format 'api1=provider1,api2=provider2'"""
-    if not provider_str:
-        return {}
-
-    fixtures = {}
-    pairs = provider_str.split(",")
-    for pair in pairs:
-        if "=" not in pair:
-            raise ValueError(f"Invalid provider specification: {pair}. Expected format: api=provider")
-        api, fixture = pair.split("=")
-        if api not in available_fixtures:
-            raise ValueError(f"Unknown API: {api}. Available APIs: {list(available_fixtures.keys())}")
-        if fixture not in available_fixtures[api]:
-            raise ValueError(
-                f"Unknown provider '{fixture}' for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-        fixtures[api] = fixture
-
-    # Check that all provided APIs are supported
-    for api in available_fixtures.keys():
-        if api not in fixtures:
-            raise ValueError(
-                f"Missing provider fixture for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-    return fixtures
-
-
-def pytest_itemcollected(item):
-    # Get all markers as a list
-    filtered = ("asyncio", "parametrize")
-    marks = [mark.name for mark in item.iter_markers() if mark.name not in filtered]
-    if marks:
-        marks = colored(",".join(marks), "yellow")
-        item.name = f"{item.name}[{marks}]"
-
-
-def pytest_collection_modifyitems(session, config, items):
-    test_config = get_test_config_from_config_file(config)
-    if test_config is None:
-        return
-
-    required_tests = defaultdict(set)
-    for api_test_config in [
-        test_config.inference,
-        test_config.memory,
-        test_config.agents,
-    ]:
-        if api_test_config is None:
-            continue
-        for test in api_test_config.tests:
-            arr = test.split("::")
-            if len(arr) != 2:
-                raise ValueError(f"Invalid format for test name {test}")
-            test_path, func_name = arr
-            required_tests[Path(__file__).parent / test_path].add(func_name)
-
-    new_items, deselected_items = [], []
-    for item in items:
-        func_name = getattr(item, "originalname", item.name)
-        if func_name in required_tests[item.fspath]:
-            new_items.append(item)
-            continue
-        deselected_items.append(item)
-
-    items[:] = new_items
-    config.hook.pytest_deselected(items=deselected_items)
-
-
-pytest_plugins = [
-    "llama_stack.providers.tests.inference.fixtures",
-    "llama_stack.providers.tests.safety.fixtures",
-    "llama_stack.providers.tests.vector_io.fixtures",
-    "llama_stack.providers.tests.agents.fixtures",
-    "llama_stack.providers.tests.datasetio.fixtures",
-    "llama_stack.providers.tests.scoring.fixtures",
-    "llama_stack.providers.tests.eval.fixtures",
-    "llama_stack.providers.tests.post_training.fixtures",
-    "llama_stack.providers.tests.tools.fixtures",
-]
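
The `pytest_configure` hook above registered the `Report` plugin only when `--output` was given. A minimal self-contained sketch of that conditional registration pattern, using illustrative names (`MiniReport`, `--my-report`) rather than the originals:

```python
# conftest.py sketch of the conditional plugin registration used above.
# "--my-report" and MiniReport are illustrative names, not from the deleted code.
from pathlib import Path


class MiniReport:
    def __init__(self, output_path: str):
        self.output_path = output_path
        self.lines: list[str] = []

    def pytest_runtest_logreport(self, report):
        # Called once per phase (setup/call/teardown); record only the test body.
        if report.when == "call":
            self.lines.append(f"| {report.nodeid} | {report.outcome} |")

    def pytest_sessionfinish(self, session, exitstatus):
        # Write one markdown table row per executed test at the end of the run.
        Path(self.output_path).write_text("\n".join(self.lines) + "\n")


def pytest_addoption(parser):
    parser.addoption("--my-report", action="store", default=None)


def pytest_configure(config):
    # Register the reporting plugin only when the option was actually passed.
    if config.getoption("--my-report") is not None:
        config.pluginmanager.register(MiniReport(config.getoption("--my-report")))
```
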
diff --git a/llama_stack/providers/tests/report.py b/llama_stack/providers/tests/report.py
deleted file mode 100644
index bc29534be..000000000
--- a/llama_stack/providers/tests/report.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from collections import defaultdict
-from pathlib import Path
-
-import pytest
-from pytest import ExitCode
-from pytest_html.basereport import _process_outcome
-
-from llama_stack.models.llama.sku_list import all_registered_models
-from llama_stack.models.llama.sku_types import CoreModelId
-
-INFERENCE_APIS = ["chat_completion"]
-FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
-SUPPORTED_MODELS = {
-    "ollama": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_1b.value,
-    },
-    "fireworks": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-    "together": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-}
-
-
-class Report:
-    def __init__(self, output_path):
-        valid_file_format = (
-            output_path.split(".")[1] in ["md", "markdown"] if len(output_path.split(".")) == 2 else False
-        )
-        if not valid_file_format:
-            raise ValueError(f"Invalid output file {output_path}. Markdown file is required")
Markdown file is required") - self.output_path = output_path - self.test_data = defaultdict(dict) - self.inference_tests = defaultdict(dict) - - @pytest.hookimpl - def pytest_runtest_logreport(self, report): - # This hook is called in several phases, including setup, call and teardown - # The test is considered failed / error if any of the outcomes is not "Passed" - outcome = _process_outcome(report) - data = { - "outcome": report.outcome, - "longrepr": report.longrepr, - "name": report.nodeid, - } - if report.nodeid not in self.test_data: - self.test_data[report.nodeid] = data - elif self.test_data[report.nodeid] != outcome and outcome != "Passed": - self.test_data[report.nodeid] = data - - @pytest.hookimpl - def pytest_sessionfinish(self, session, exitstatus): - if exitstatus <= ExitCode.INTERRUPTED: - return - report = [] - report.append("# Llama Stack Integration Test Results Report") - report.append("\n## Summary") - report.append("\n## Supported Models: ") - - header = "| Model Descriptor |" - dividor = "|:---|" - for k in SUPPORTED_MODELS.keys(): - header += f"{k} |" - dividor += ":---:|" - - report.append(header) - report.append(dividor) - - rows = [] - for model in all_registered_models(): - if "Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value: - continue - row = f"| {model.core_model_id.value} |" - for k in SUPPORTED_MODELS.keys(): - if model.core_model_id.value in SUPPORTED_MODELS[k]: - row += " ✅ |" - else: - row += " ❌ |" - rows.append(row) - report.extend(rows) - - report.append("\n### Tests:") - - for provider in SUPPORTED_MODELS.keys(): - if provider not in self.inference_tests: - continue - report.append(f"\n #### {provider}") - test_table = [ - "| Area | Model | API | Functionality Test | Status |", - "|:-----|:-----|:-----|:-----|:-----|", - ] - for api in INFERENCE_APIS: - tests = self.inference_tests[provider][api] - for test_nodeid in tests: - row = "|{area} | {model} | {api} | {test} | {result} ".format( - area="Text" if "text" in test_nodeid else "Vision", - model=("Llama-3.1-8B-Instruct" if "text" in test_nodeid else "Llama3.2-11B-Vision-Instruct"), - api=f"/{api}", - test=self.get_simple_function_name(test_nodeid), - result=("✅" if self.test_data[test_nodeid]["outcome"] == "passed" else "❌"), - ) - test_table += [row] - report.extend(test_table) - report.append("\n") - - output_file = Path(self.output_path) - output_file.write_text("\n".join(report)) - print(f"\n Report generated: {output_file.absolute()}") - - @pytest.hookimpl(trylast=True) - def pytest_collection_modifyitems(self, session, config, items): - for item in items: - inference = item.callspec.params.get("inference_stack") - if "inference" in item.nodeid: - func_name = getattr(item, "originalname", item.name) - for api in INFERENCE_APIS: - if api in func_name: - api_tests = self.inference_tests[inference].get(api, set()) - api_tests.add(item.nodeid) - self.inference_tests[inference][api] = api_tests - - def get_simple_function_name(self, nodeid): - """Extract function name from nodeid. 
diff --git a/llama_stack/templates/cerebras/report.md b/llama_stack/templates/cerebras/report.md
deleted file mode 100644
index f240e354b..000000000
--- a/llama_stack/templates/cerebras/report.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Report for cerebras distribution
-
-## Supported Models
-| Model Descriptor | cerebras |
-|:---|:---|
-| meta-llama/Llama-3-8B-Instruct | ❌ |
-| meta-llama/Llama-3-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-8B-Instruct | ✅ |
-| meta-llama/Llama-3.1-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
-| meta-llama/Llama-3.2-1B-Instruct | ❌ |
-| meta-llama/Llama-3.2-3B-Instruct | ❌ |
-| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.3-70B-Instruct | ✅ |
-| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
-| meta-llama/Llama-Guard-3-1B | ❌ |
-| meta-llama/Llama-Guard-3-8B | ❌ |
-| meta-llama/Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ❌ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve | | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ❌ |
diff --git a/llama_stack/templates/fireworks/report.md b/llama_stack/templates/fireworks/report.md
deleted file mode 100644
index b520acf8e..000000000
--- a/llama_stack/templates/fireworks/report.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Report for fireworks distribution
-
-## Supported Models
-| Model Descriptor | fireworks |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/llama_stack/templates/ollama/report.md b/llama_stack/templates/ollama/report.md
deleted file mode 100644
index 4b2dada3a..000000000
--- a/llama_stack/templates/ollama/report.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Report for ollama distribution
-
-## Supported Models
-| Model Descriptor | ollama |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ❌ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve | | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/llama_stack/templates/tgi/report.md b/llama_stack/templates/tgi/report.md
deleted file mode 100644
index b0f5d88a2..000000000
--- a/llama_stack/templates/tgi/report.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Report for tgi distribution
-
-## Supported Models
-| Model Descriptor | tgi |
-|:---|:---|
-| Llama-3-8B-Instruct | ✅ |
-| Llama-3-70B-Instruct | ✅ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ✅ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve | | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/together/report.md b/llama_stack/templates/together/report.md
deleted file mode 100644
index 71ae83597..000000000
--- a/llama_stack/templates/together/report.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Report for together distribution
-
-## Supported Models
-| Model Descriptor | together |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ❌ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 8c1ee6355..31d58c83f 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -28,7 +28,6 @@ if no model is specified.
 
 Experimental, under development, options:
 - `--record-responses`: record new API responses instead of using cached ones
-- `--report`: path where the test report should be written, e.g. --report=/path/to/report.md
 
 ## Examples
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 131219e52..ec5918268 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -15,8 +15,6 @@ from dotenv import load_dotenv
 
 from llama_stack.log import get_logger
 
-from .report import Report
-
 logger = get_logger(__name__, category="tests")
 
@@ -60,9 +58,6 @@ def pytest_configure(config):
         os.environ["DISABLE_CODE_SANDBOX"] = "1"
         logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
 
-    if config.getoption("--report"):
-        config.pluginmanager.register(Report(config))
-
 
 def pytest_addoption(parser):
     parser.addoption(
diff --git a/tests/integration/metadata.py b/tests/integration/metadata.py
deleted file mode 100644
index 55663c046..000000000
--- a/tests/integration/metadata.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.providers.datatypes import Api
-
-INFERENCE_API_CAPA_TEST_MAP = {
-    "chat_completion": {
-        "streaming": [
-            "test_text_chat_completion_streaming",
-            "test_image_chat_completion_streaming",
-        ],
-        "non_streaming": [
-            "test_image_chat_completion_non_streaming",
-            "test_text_chat_completion_non_streaming",
-        ],
-        "tool_calling": [
-            "test_text_chat_completion_with_tool_calling_and_streaming",
-            "test_text_chat_completion_with_tool_calling_and_non_streaming",
-        ],
-        "log_probs": [
-            "test_completion_log_probs_non_streaming",
-            "test_completion_log_probs_streaming",
-        ],
-    },
-    "completion": {
-        "streaming": ["test_text_completion_streaming"],
-        "non_streaming": ["test_text_completion_non_streaming"],
-        "structured_output": ["test_text_completion_structured_output"],
-    },
-}
-
-VECTORIO_API_TEST_MAP = {
-    "retrieve": {
-        "": ["test_vector_db_retrieve"],
-    }
-}
-
-AGENTS_API_TEST_MAP = {
-    "create_agent_turn": {
-        "rag": ["test_rag_agent"],
-        "custom_tool": ["test_custom_tool"],
-        "code_execution": ["test_code_interpreter_for_attachments"],
-    }
-}
-
-
-API_MAPS = {
-    Api.inference: INFERENCE_API_CAPA_TEST_MAP,
-    Api.vector_io: VECTORIO_API_TEST_MAP,
-    Api.agents: AGENTS_API_TEST_MAP,
-}
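
`API_MAPS` is a two-level mapping of API endpoint -> capability -> test function names; the report plugin deleted below walks it to emit one table row per test. A small traversal sketch, inlining a copied excerpt of the mapping since the module itself is being removed:

```python
# Sketch: flattening the api -> capability -> tests mapping that report.py
# below iterates over. The dict literal is a copied excerpt, not an import.
VECTORIO_API_TEST_MAP = {
    "retrieve": {
        "": ["test_vector_db_retrieve"],
    }
}

for api, capa_map in VECTORIO_API_TEST_MAP.items():
    for capa, tests in capa_map.items():
        for test_name in tests:
            print(f"| /{api} | {capa} | {test_name} |")
# | /retrieve |  | test_vector_db_retrieve |
```
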
diff --git a/tests/integration/report.py b/tests/integration/report.py
deleted file mode 100644
index 97543fa9d..000000000
--- a/tests/integration/report.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from collections import defaultdict
-from pathlib import Path
-
-import pytest
-from pytest import CollectReport
-from termcolor import cprint
-
-from llama_stack.models.llama.sku_list import (
-    all_registered_models,
-    llama3_1_instruct_models,
-    llama3_2_instruct_models,
-    llama3_3_instruct_models,
-    llama3_instruct_models,
-    safety_models,
-)
-from llama_stack.models.llama.sku_types import CoreModelId
-from llama_stack.providers.datatypes import Api
-
-from .metadata import API_MAPS
-
-
-def featured_models():
-    models = [
-        *llama3_instruct_models(),
-        *llama3_1_instruct_models(),
-        *llama3_2_instruct_models(),
-        *llama3_3_instruct_models(),
-        *safety_models(),
-    ]
-    return {model.huggingface_repo: model for model in models if not model.variant}
-
-
-SUPPORTED_MODELS = {
-    "ollama": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_1b.value,
-    },
-    "tgi": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
-    "vllm": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
-}
-
-
-class Report:
-    def __init__(self, config):
-        self.distro_name = None
-        self.config = config
-        self.output_path = Path(config.getoption("--report")) if config.getoption("--report") else None
-
-        stack_config = self.config.getoption("--stack-config")
-        if stack_config:
-            is_url = stack_config.startswith("http") or "//" in stack_config
-            is_yaml = stack_config.endswith(".yaml")
-            if not is_url and not is_yaml:
-                self.distro_name = stack_config
-
-        self.report_data = defaultdict(dict)
-        # test function -> test nodeid
-        self.test_data = dict()
-        self.test_name_to_nodeid = defaultdict(list)
-        self.vision_model_id = None
-        self.text_model_id = None
-        self.client = None
-
-    @pytest.hookimpl(tryfirst=True)
-    def pytest_runtest_logreport(self, report):
-        # This hook is called in several phases, including setup, call and teardown
-        # The test is considered failed / error if any of the outcomes is not "Passed"
-        outcome = self._process_outcome(report)
-        if report.nodeid not in self.test_data:
-            self.test_data[report.nodeid] = outcome
-        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
-            self.test_data[report.nodeid] = outcome
-
-    def pytest_sessionfinish(self, session):
-        if not self.client:
-            return
-
-        report = []
-        report.append(f"# Report for {self.distro_name} distribution")
-        report.append("\n## Supported Models")
-
-        header = f"| Model Descriptor | {self.distro_name} |"
-        dividor = "|:---|:---|"
-
-        report.append(header)
-        report.append(dividor)
-
-        rows = []
-        if self.distro_name in SUPPORTED_MODELS:
-            for model in all_registered_models():
-                if ("Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value) or (
-                    model.variant
-                ):
-                    continue
-                row = f"| {model.core_model_id.value} |"
-                if model.core_model_id.value in SUPPORTED_MODELS[self.distro_name]:
-                    row += " ✅ |"
-                else:
-                    row += " ❌ |"
-                rows.append(row)
-        else:
-            supported_models = {m.identifier for m in self.client.models.list()}
-            for hf_name, model in featured_models().items():
-                row = f"| {model.core_model_id.value} |"
-                if hf_name in supported_models:
-                    row += " ✅ |"
-                else:
-                    row += " ❌ |"
-                rows.append(row)
-        report.extend(rows)
-
-        report.append("\n## Inference")
-        test_table = [
-            "| Model | API | Capability | Test | Status |",
-            "|:----- |:-----|:-----|:-----|:-----|",
-        ]
-        for api, capa_map in API_MAPS[Api.inference].items():
-            for capa, tests in capa_map.items():
-                for test_name in tests:
-                    model_id = self.text_model_id if "text" in test_name else self.vision_model_id
-                    test_nodeids = self.test_name_to_nodeid[test_name]
-                    if not test_nodeids:
-                        continue
-
-                    # There might be more than one parametrizations for the same test function. We take
-                    # the result of the first one for now. Ideally we should mark the test as failed if
-                    # any of the parametrizations failed.
-                    test_table.append(
-                        f"| {model_id} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
-                    )
-
-        report.extend(test_table)
-
-        name_map = {Api.vector_io: "Vector IO", Api.agents: "Agents"}
-        providers = self.client.providers.list()
-        for api_group in [Api.vector_io, Api.agents]:
-            api_capitalized = name_map[api_group]
-            report.append(f"\n## {api_capitalized}")
-            test_table = [
-                "| Provider | API | Capability | Test | Status |",
-                "|:-----|:-----|:-----|:-----|:-----|",
-            ]
-            provider = [p for p in providers if p.api == str(api_group.name)]
-            provider_str = ",".join(str(p) for p in provider) if provider else ""
-            for api, capa_map in API_MAPS[api_group].items():
-                for capa, tests in capa_map.items():
-                    for test_name in tests:
-                        test_nodeids = self.test_name_to_nodeid[test_name]
-                        if not test_nodeids:
-                            continue
-                        test_table.append(
-                            f"| {provider_str} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
-                        )
-            report.extend(test_table)
-
-        output_file = self.output_path
-        text = "\n".join(report) + "\n"
-        output_file.write_text(text)
-        cprint(f"\nReport generated: {output_file.absolute()}", "green")
-
-    def pytest_runtest_makereport(self, item, call):
-        func_name = getattr(item, "originalname", item.name)
-        self.test_name_to_nodeid[func_name].append(item.nodeid)
-
-        # Get values from fixtures for report output
-        if model_id := item.funcargs.get("text_model_id"):
-            parts = model_id.split("/")
-            text_model = parts[1] if len(parts) > 1 else model_id
-            self.text_model_id = self.text_model_id or text_model
-        elif model_id := item.funcargs.get("vision_model_id"):
-            parts = model_id.split("/")
-            vision_model = parts[1] if len(parts) > 1 else model_id
-            self.vision_model_id = self.vision_model_id or vision_model
-
-        if not self.client:
-            self.client = item.funcargs.get("llama_stack_client")
-
-    def _print_result_icon(self, result):
-        if result == "Passed":
-            return "✅"
-        elif result == "Failed" or result == "Error":
-            return "❌"
-        else:
-            # result == "Skipped":
-            return "⏭️"
-
-    def _process_outcome(self, report: CollectReport):
-        if self._is_error(report):
-            return "Error"
-        if hasattr(report, "wasxfail"):
-            if report.outcome in ["passed", "failed"]:
-                return "XPassed"
-            if report.outcome == "skipped":
-                return "XFailed"
-        return report.outcome.capitalize()
-
-    def _is_error(self, report: CollectReport):
-        return report.when in ["setup", "teardown", "collect"] and report.outcome == "failed"
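
For reference, the `_process_outcome` / `_is_error` pair above maps pytest's per-phase outcomes (including xfail results) onto the report labels. A standalone sketch of that mapping, with a hypothetical stand-in for pytest's report object:

```python
# Standalone sketch of the outcome mapping in _process_outcome above.
# FakeReport is an illustrative stand-in for pytest's TestReport, not a real API.
class FakeReport:
    def __init__(self, when, outcome, wasxfail=None):
        self.when = when
        self.outcome = outcome
        if wasxfail is not None:
            self.wasxfail = wasxfail


def is_error(report) -> bool:
    # A failure outside the test body (setup/teardown/collect) is an error.
    return report.when in ["setup", "teardown", "collect"] and report.outcome == "failed"


def process_outcome(report) -> str:
    if is_error(report):
        return "Error"
    if hasattr(report, "wasxfail"):
        if report.outcome in ["passed", "failed"]:
            return "XPassed"
        if report.outcome == "skipped":
            return "XFailed"
    return report.outcome.capitalize()


assert process_outcome(FakeReport("call", "passed")) == "Passed"
assert process_outcome(FakeReport("setup", "failed")) == "Error"
assert process_outcome(FakeReport("call", "passed", wasxfail="reason")) == "XPassed"
assert process_outcome(FakeReport("call", "skipped", wasxfail="reason")) == "XFailed"
```
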