forked from phoenix-oss/llama-stack-mirror
chore: remove pytest reports (#2156)
# What does this PR do?

Cleanup old test code too.

Signed-off-by: Sébastien Han <seb@redhat.com>
This commit is contained in: parent 8e316c9b1e, commit 26dffff92a
13 changed files with 0 additions and 1032 deletions
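For context, the removed report flow hung off two pytest options defined in the deleted provider-test conftest.py shown below: `--config` selected a YAML test plan and `--output` wrote a markdown report. A rough, illustrative invocation reconstructed from the removed help strings (the exact file names are assumptions, not something this PR documents) would have looked like:

    pytest llama_stack/providers/tests/ --config=test_config.yml --output=pytest_report.md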
@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -1,55 +0,0 @@
inference:
  tests:
  - inference/test_vision_inference.py::test_vision_chat_completion_streaming
  - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
  - inference/test_text_inference.py::test_structured_output
  - inference/test_text_inference.py::test_chat_completion_streaming
  - inference/test_text_inference.py::test_chat_completion_non_streaming
  - inference/test_text_inference.py::test_chat_completion_with_tool_calling
  - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming

  scenarios:
  - provider_fixtures:
      inference: ollama
  - fixture_combo_id: fireworks
  - provider_fixtures:
      inference: together
  # - inference: tgi
  # - inference: vllm_remote

  inference_models:
  - meta-llama/Llama-3.1-8B-Instruct
  - meta-llama/Llama-3.2-11B-Vision-Instruct


agents:
  tests:
  - agents/test_agents.py::test_agent_turns_with_safety
  - agents/test_agents.py::test_rag_agent

  scenarios:
  - fixture_combo_id: ollama
  - fixture_combo_id: together
  - fixture_combo_id: fireworks

  inference_models:
  - meta-llama/Llama-3.2-1B-Instruct

  safety_shield: meta-llama/Llama-Guard-3-1B


memory:
  tests:
  - memory/test_memory.py::test_query_documents

  scenarios:
  - fixture_combo_id: ollama
  - provider_fixtures:
      inference: sentence_transformers
      memory: faiss
  - fixture_combo_id: chroma

  inference_models:
  - meta-llama/Llama-3.2-1B-Instruct

  embedding_model: all-MiniLM-L6-v2
@@ -1,296 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from collections import defaultdict
from pathlib import Path
from typing import Any

import pytest
import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from termcolor import colored

from llama_stack.distribution.datatypes import Provider
from llama_stack.providers.datatypes import RemoteProviderConfig

from .env import get_env_or_fail
from .report import Report


class ProviderFixture(BaseModel):
    providers: list[Provider]
    provider_data: dict[str, Any] | None = None


class TestScenario(BaseModel):
    # provider fixtures can be either a mark or a dictionary of api -> providers
    provider_fixtures: dict[str, str] = Field(default_factory=dict)
    fixture_combo_id: str | None = None


class APITestConfig(BaseModel):
    scenarios: list[TestScenario] = Field(default_factory=list)
    inference_models: list[str] = Field(default_factory=list)

    # test name format should be <relative_path.py>::<test_name>
    tests: list[str] = Field(default_factory=list)


class MemoryApiTestConfig(APITestConfig):
    embedding_model: str | None = Field(default_factory=None)


class AgentsApiTestConfig(APITestConfig):
    safety_shield: str | None = Field(default_factory=None)


class TestConfig(BaseModel):
    inference: APITestConfig | None = None
    agents: AgentsApiTestConfig | None = None
    memory: MemoryApiTestConfig | None = None


def get_test_config_from_config_file(metafunc_config):
    config_file = metafunc_config.getoption("--config")
    if config_file is None:
        return None

    config_file_path = Path(__file__).parent / config_file
    if not config_file_path.exists():
        raise ValueError(
            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
        )
    with open(config_file_path) as config_file:
        config = yaml.safe_load(config_file)
        return TestConfig(**config)


def get_test_config_for_api(metafunc_config, api):
    test_config = get_test_config_from_config_file(metafunc_config)
    if test_config is None:
        return None
    return getattr(test_config, api)


def get_provider_fixture_overrides_from_test_config(metafunc_config, api, default_provider_fixture_combinations):
    api_config = get_test_config_for_api(metafunc_config, api)
    if api_config is None:
        return None

    fixture_combo_ids = set()
    custom_provider_fixture_combos = []
    for scenario in api_config.scenarios:
        if scenario.fixture_combo_id:
            fixture_combo_ids.add(scenario.fixture_combo_id)
        else:
            custom_provider_fixture_combos.append(
                pytest.param(
                    scenario.provider_fixtures,
                    id=scenario.provider_fixtures.get("inference") or "",
                )
            )

    if len(fixture_combo_ids) > 0:
        for default_fixture in default_provider_fixture_combinations:
            if default_fixture.id in fixture_combo_ids:
                custom_provider_fixture_combos.append(default_fixture)
    return custom_provider_fixture_combos


def remote_stack_fixture() -> ProviderFixture:
    if url := os.getenv("REMOTE_STACK_URL", None):
        config = RemoteProviderConfig.from_url(url)
    else:
        config = RemoteProviderConfig(
            host=get_env_or_fail("REMOTE_STACK_HOST"),
            port=int(get_env_or_fail("REMOTE_STACK_PORT")),
        )
    return ProviderFixture(
        providers=[
            Provider(
                provider_id="test::remote",
                provider_type="test::remote",
                config=config.model_dump(),
            )
        ],
    )


def pytest_configure(config):
    config.option.tbstyle = "short"
    config.option.disable_warnings = True

    """Load environment variables at start of test run"""
    # Load from .env file if it exists
    env_file = Path(__file__).parent / ".env"
    if env_file.exists():
        load_dotenv(env_file)

    # Load any environment variables passed via --env
    env_vars = config.getoption("--env") or []
    for env_var in env_vars:
        key, value = env_var.split("=", 1)
        os.environ[key] = value

    if config.getoption("--output") is not None:
        config.pluginmanager.register(Report(config.getoption("--output")))


def pytest_addoption(parser):
    parser.addoption(
        "--providers",
        default="",
        help=(
            "Provider configuration in format: api1=provider1,api2=provider2. "
            "Example: --providers inference=ollama,safety=meta-reference"
        ),
    )
    parser.addoption(
        "--config",
        action="store",
        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
    )
    parser.addoption(
        "--output",
        action="store",
        help="Set output file for test report, e.g. --output=pytest_report.md",
    )
    """Add custom command line options"""
    parser.addoption("--env", action="append", help="Set environment variables, e.g. --env KEY=value")
    parser.addoption(
        "--inference-model",
        action="store",
        default="meta-llama/Llama-3.2-3B-Instruct",
        help="Specify the inference model to use for testing",
    )
    parser.addoption(
        "--safety-shield",
        action="store",
        default="meta-llama/Llama-Guard-3-1B",
        help="Specify the safety shield to use for testing",
    )
    parser.addoption(
        "--embedding-model",
        action="store",
        default=None,
        help="Specify the embedding model to use for testing",
    )
    parser.addoption(
        "--judge-model",
        action="store",
        default="meta-llama/Llama-3.1-8B-Instruct",
        help="Specify the judge model to use for testing",
    )


def make_provider_id(providers: dict[str, str]) -> str:
    return ":".join(f"{api}={provider}" for api, provider in sorted(providers.items()))


def get_provider_marks(providers: dict[str, str]) -> list[Any]:
    marks = []
    for provider in providers.values():
        marks.append(getattr(pytest.mark, provider))
    return marks


def get_provider_fixture_overrides(config, available_fixtures: dict[str, list[str]]) -> list[pytest.param] | None:
    provider_str = config.getoption("--providers")
    if not provider_str:
        return None

    fixture_dict = parse_fixture_string(provider_str, available_fixtures)
    return [
        pytest.param(
            fixture_dict,
            id=make_provider_id(fixture_dict),
            marks=get_provider_marks(fixture_dict),
        )
    ]


def parse_fixture_string(provider_str: str, available_fixtures: dict[str, list[str]]) -> dict[str, str]:
    """Parse provider string of format 'api1=provider1,api2=provider2'"""
    if not provider_str:
        return {}

    fixtures = {}
    pairs = provider_str.split(",")
    for pair in pairs:
        if "=" not in pair:
            raise ValueError(f"Invalid provider specification: {pair}. Expected format: api=provider")
        api, fixture = pair.split("=")
        if api not in available_fixtures:
            raise ValueError(f"Unknown API: {api}. Available APIs: {list(available_fixtures.keys())}")
        if fixture not in available_fixtures[api]:
            raise ValueError(
                f"Unknown provider '{fixture}' for API '{api}'. Available providers: {list(available_fixtures[api])}"
            )
        fixtures[api] = fixture

    # Check that all provided APIs are supported
    for api in available_fixtures.keys():
        if api not in fixtures:
            raise ValueError(
                f"Missing provider fixture for API '{api}'. Available providers: {list(available_fixtures[api])}"
            )
    return fixtures


def pytest_itemcollected(item):
    # Get all markers as a list
    filtered = ("asyncio", "parametrize")
    marks = [mark.name for mark in item.iter_markers() if mark.name not in filtered]
    if marks:
        marks = colored(",".join(marks), "yellow")
        item.name = f"{item.name}[{marks}]"


def pytest_collection_modifyitems(session, config, items):
    test_config = get_test_config_from_config_file(config)
    if test_config is None:
        return

    required_tests = defaultdict(set)
    for api_test_config in [
        test_config.inference,
        test_config.memory,
        test_config.agents,
    ]:
        if api_test_config is None:
            continue
        for test in api_test_config.tests:
            arr = test.split("::")
            if len(arr) != 2:
                raise ValueError(f"Invalid format for test name {test}")
            test_path, func_name = arr
            required_tests[Path(__file__).parent / test_path].add(func_name)

    new_items, deselected_items = [], []
    for item in items:
        func_name = getattr(item, "originalname", item.name)
        if func_name in required_tests[item.fspath]:
            new_items.append(item)
            continue
        deselected_items.append(item)

    items[:] = new_items
    config.hook.pytest_deselected(items=deselected_items)


pytest_plugins = [
    "llama_stack.providers.tests.inference.fixtures",
    "llama_stack.providers.tests.safety.fixtures",
    "llama_stack.providers.tests.vector_io.fixtures",
    "llama_stack.providers.tests.agents.fixtures",
    "llama_stack.providers.tests.datasetio.fixtures",
    "llama_stack.providers.tests.scoring.fixtures",
    "llama_stack.providers.tests.eval.fixtures",
    "llama_stack.providers.tests.post_training.fixtures",
    "llama_stack.providers.tests.tools.fixtures",
]
@@ -1,176 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from collections import defaultdict
from pathlib import Path

import pytest
from pytest import ExitCode
from pytest_html.basereport import _process_outcome

from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.models.llama.sku_types import CoreModelId

INFERENCE_APIS = ["chat_completion"]
FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
SUPPORTED_MODELS = {
    "ollama": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_1b.value,
    },
    "fireworks": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_11b_vision.value,
    },
    "together": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_11b_vision.value,
    },
}


class Report:
    def __init__(self, output_path):
        valid_file_format = (
            output_path.split(".")[1] in ["md", "markdown"] if len(output_path.split(".")) == 2 else False
        )
        if not valid_file_format:
            raise ValueError(f"Invalid output file {output_path}. Markdown file is required")
        self.output_path = output_path
        self.test_data = defaultdict(dict)
        self.inference_tests = defaultdict(dict)

    @pytest.hookimpl
    def pytest_runtest_logreport(self, report):
        # This hook is called in several phases, including setup, call and teardown
        # The test is considered failed / error if any of the outcomes is not "Passed"
        outcome = _process_outcome(report)
        data = {
            "outcome": report.outcome,
            "longrepr": report.longrepr,
            "name": report.nodeid,
        }
        if report.nodeid not in self.test_data:
            self.test_data[report.nodeid] = data
        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
            self.test_data[report.nodeid] = data

    @pytest.hookimpl
    def pytest_sessionfinish(self, session, exitstatus):
        if exitstatus <= ExitCode.INTERRUPTED:
            return
        report = []
        report.append("# Llama Stack Integration Test Results Report")
        report.append("\n## Summary")
        report.append("\n## Supported Models: ")

        header = "| Model Descriptor |"
        dividor = "|:---|"
        for k in SUPPORTED_MODELS.keys():
            header += f"{k} |"
            dividor += ":---:|"

        report.append(header)
        report.append(dividor)

        rows = []
        for model in all_registered_models():
            if "Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value:
                continue
            row = f"| {model.core_model_id.value} |"
            for k in SUPPORTED_MODELS.keys():
                if model.core_model_id.value in SUPPORTED_MODELS[k]:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
            rows.append(row)
        report.extend(rows)

        report.append("\n### Tests:")

        for provider in SUPPORTED_MODELS.keys():
            if provider not in self.inference_tests:
                continue
            report.append(f"\n #### {provider}")
            test_table = [
                "| Area | Model | API | Functionality Test | Status |",
                "|:-----|:-----|:-----|:-----|:-----|",
            ]
            for api in INFERENCE_APIS:
                tests = self.inference_tests[provider][api]
                for test_nodeid in tests:
                    row = "|{area} | {model} | {api} | {test} | {result} ".format(
                        area="Text" if "text" in test_nodeid else "Vision",
                        model=("Llama-3.1-8B-Instruct" if "text" in test_nodeid else "Llama3.2-11B-Vision-Instruct"),
                        api=f"/{api}",
                        test=self.get_simple_function_name(test_nodeid),
                        result=("✅" if self.test_data[test_nodeid]["outcome"] == "passed" else "❌"),
                    )
                    test_table += [row]
            report.extend(test_table)
            report.append("\n")

        output_file = Path(self.output_path)
        output_file.write_text("\n".join(report))
        print(f"\n Report generated: {output_file.absolute()}")

    @pytest.hookimpl(trylast=True)
    def pytest_collection_modifyitems(self, session, config, items):
        for item in items:
            inference = item.callspec.params.get("inference_stack")
            if "inference" in item.nodeid:
                func_name = getattr(item, "originalname", item.name)
                for api in INFERENCE_APIS:
                    if api in func_name:
                        api_tests = self.inference_tests[inference].get(api, set())
                        api_tests.add(item.nodeid)
                        self.inference_tests[inference][api] = api_tests

    def get_simple_function_name(self, nodeid):
        """Extract function name from nodeid.

        Examples:
        - 'tests/test_math.py::test_addition' -> 'test_addition'
        - 'tests/test_math.py::TestClass::test_method' -> test_method'
        """
        parts = nodeid.split("::")
        func_name = nodeid  # Fallback to full nodeid if pattern doesn't match
        if len(parts) == 2:  # Simple function
            func_name = parts[1]
        elif len(parts) == 3:  # Class method
            func_name = parts[2]
        return func_name.split("[")[0]
@@ -1,43 +0,0 @@
# Report for cerebras distribution

## Supported Models
| Model Descriptor | cerebras |
|:---|:---|
| meta-llama/Llama-3-8B-Instruct | ❌ |
| meta-llama/Llama-3-70B-Instruct | ❌ |
| meta-llama/Llama-3.1-8B-Instruct | ✅ |
| meta-llama/Llama-3.1-70B-Instruct | ❌ |
| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
| meta-llama/Llama-3.2-1B-Instruct | ❌ |
| meta-llama/Llama-3.2-3B-Instruct | ❌ |
| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
| meta-llama/Llama-3.3-70B-Instruct | ✅ |
| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
| meta-llama/Llama-Guard-3-1B | ❌ |
| meta-llama/Llama-Guard-3-8B | ❌ |
| meta-llama/Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ❌ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ❌ |
@@ -1,45 +0,0 @@
# Report for fireworks distribution

## Supported Models
| Model Descriptor | fireworks |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ❌ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -1,43 +0,0 @@
# Report for ollama distribution

## Supported Models
| Model Descriptor | ollama |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ❌ |
| Llama-Guard-3-1B | ✅ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -1,44 +0,0 @@
# Report for tgi distribution

## Supported Models
| Model Descriptor | tgi |
|:---|:---|
| Llama-3-8B-Instruct | ✅ |
| Llama-3-70B-Instruct | ✅ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ✅ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ✅ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ✅ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|
| /create_agent_turn | rag | test_rag_agent | ✅ |
| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
@@ -1,45 +0,0 @@
# Report for together distribution

## Supported Models
| Model Descriptor | together |
|:---|:---|
| Llama-3-8B-Instruct | ❌ |
| Llama-3-70B-Instruct | ❌ |
| Llama3.1-8B-Instruct | ✅ |
| Llama3.1-70B-Instruct | ✅ |
| Llama3.1-405B-Instruct | ✅ |
| Llama3.2-1B-Instruct | ❌ |
| Llama3.2-3B-Instruct | ✅ |
| Llama3.2-11B-Vision-Instruct | ✅ |
| Llama3.2-90B-Vision-Instruct | ✅ |
| Llama3.3-70B-Instruct | ✅ |
| Llama-Guard-3-11B-Vision | ✅ |
| Llama-Guard-3-1B | ❌ |
| Llama-Guard-3-8B | ✅ |
| Llama-Guard-2-8B | ❌ |

## Inference
| Model | API | Capability | Test | Status |
|:----- |:-----|:-----|:-----|:-----|
| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |

## Vector IO
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::faiss | /retrieve | | test_vector_db_retrieve | ✅ |

## Agents
| Provider | API | Capability | Test | Status |
|:-----|:-----|:-----|:-----|:-----|
| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
@@ -28,7 +28,6 @@ if no model is specified.
 
 Experimental, under development, options:
 - `--record-responses`: record new API responses instead of using cached ones
-- `--report`: path where the test report should be written, e.g. --report=/path/to/report.md
 
 
 ## Examples
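The `--report` flag removed above was consumed by the integration-test Report plugin deleted at the bottom of this commit, which also read `--stack-config` to derive the distribution name used in the report title. An illustrative invocation reconstructed from those removed options (the `tests/integration` path is an assumption; this PR does not document a command) would have been:

    pytest tests/integration --stack-config=fireworks --report=/path/to/report.md

The output was a markdown file in the same shape as the per-distribution reports deleted earlier in this diff.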
@@ -15,8 +15,6 @@ from dotenv import load_dotenv
 
 from llama_stack.log import get_logger
 
-from .report import Report
-
 logger = get_logger(__name__, category="tests")
 
 
@@ -60,9 +58,6 @@ def pytest_configure(config):
         os.environ["DISABLE_CODE_SANDBOX"] = "1"
         logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
 
-    if config.getoption("--report"):
-        config.pluginmanager.register(Report(config))
-
 
 def pytest_addoption(parser):
     parser.addoption(
@@ -1,54 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api

INFERENCE_API_CAPA_TEST_MAP = {
    "chat_completion": {
        "streaming": [
            "test_text_chat_completion_streaming",
            "test_image_chat_completion_streaming",
        ],
        "non_streaming": [
            "test_image_chat_completion_non_streaming",
            "test_text_chat_completion_non_streaming",
        ],
        "tool_calling": [
            "test_text_chat_completion_with_tool_calling_and_streaming",
            "test_text_chat_completion_with_tool_calling_and_non_streaming",
        ],
        "log_probs": [
            "test_completion_log_probs_non_streaming",
            "test_completion_log_probs_streaming",
        ],
    },
    "completion": {
        "streaming": ["test_text_completion_streaming"],
        "non_streaming": ["test_text_completion_non_streaming"],
        "structured_output": ["test_text_completion_structured_output"],
    },
}

VECTORIO_API_TEST_MAP = {
    "retrieve": {
        "": ["test_vector_db_retrieve"],
    }
}

AGENTS_API_TEST_MAP = {
    "create_agent_turn": {
        "rag": ["test_rag_agent"],
        "custom_tool": ["test_custom_tool"],
        "code_execution": ["test_code_interpreter_for_attachments"],
    }
}


API_MAPS = {
    Api.inference: INFERENCE_API_CAPA_TEST_MAP,
    Api.vector_io: VECTORIO_API_TEST_MAP,
    Api.agents: AGENTS_API_TEST_MAP,
}
@@ -1,220 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from collections import defaultdict
from pathlib import Path

import pytest
from pytest import CollectReport
from termcolor import cprint

from llama_stack.models.llama.sku_list import (
    all_registered_models,
    llama3_1_instruct_models,
    llama3_2_instruct_models,
    llama3_3_instruct_models,
    llama3_instruct_models,
    safety_models,
)
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import Api

from .metadata import API_MAPS


def featured_models():
    models = [
        *llama3_instruct_models(),
        *llama3_1_instruct_models(),
        *llama3_2_instruct_models(),
        *llama3_3_instruct_models(),
        *safety_models(),
    ]
    return {model.huggingface_repo: model for model in models if not model.variant}


SUPPORTED_MODELS = {
    "ollama": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_1b.value,
    },
    "tgi": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
    "vllm": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
}


class Report:
    def __init__(self, config):
        self.distro_name = None
        self.config = config
        self.output_path = Path(config.getoption("--report")) if config.getoption("--report") else None

        stack_config = self.config.getoption("--stack-config")
        if stack_config:
            is_url = stack_config.startswith("http") or "//" in stack_config
            is_yaml = stack_config.endswith(".yaml")
            if not is_url and not is_yaml:
                self.distro_name = stack_config

        self.report_data = defaultdict(dict)
        # test function -> test nodeid
        self.test_data = dict()
        self.test_name_to_nodeid = defaultdict(list)
        self.vision_model_id = None
        self.text_model_id = None
        self.client = None

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_logreport(self, report):
        # This hook is called in several phases, including setup, call and teardown
        # The test is considered failed / error if any of the outcomes is not "Passed"
        outcome = self._process_outcome(report)
        if report.nodeid not in self.test_data:
            self.test_data[report.nodeid] = outcome
        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
            self.test_data[report.nodeid] = outcome

    def pytest_sessionfinish(self, session):
        if not self.client:
            return

        report = []
        report.append(f"# Report for {self.distro_name} distribution")
        report.append("\n## Supported Models")

        header = f"| Model Descriptor | {self.distro_name} |"
        dividor = "|:---|:---|"

        report.append(header)
        report.append(dividor)

        rows = []
        if self.distro_name in SUPPORTED_MODELS:
            for model in all_registered_models():
                if ("Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value) or (
                    model.variant
                ):
                    continue
                row = f"| {model.core_model_id.value} |"
                if model.core_model_id.value in SUPPORTED_MODELS[self.distro_name]:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        else:
            supported_models = {m.identifier for m in self.client.models.list()}
            for hf_name, model in featured_models().items():
                row = f"| {model.core_model_id.value} |"
                if hf_name in supported_models:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        report.extend(rows)

        report.append("\n## Inference")
        test_table = [
            "| Model | API | Capability | Test | Status |",
            "|:----- |:-----|:-----|:-----|:-----|",
        ]
        for api, capa_map in API_MAPS[Api.inference].items():
            for capa, tests in capa_map.items():
                for test_name in tests:
                    model_id = self.text_model_id if "text" in test_name else self.vision_model_id
                    test_nodeids = self.test_name_to_nodeid[test_name]
                    if not test_nodeids:
                        continue

                    # There might be more than one parametrizations for the same test function. We take
                    # the result of the first one for now. Ideally we should mark the test as failed if
                    # any of the parametrizations failed.
                    test_table.append(
                        f"| {model_id} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                    )

        report.extend(test_table)

        name_map = {Api.vector_io: "Vector IO", Api.agents: "Agents"}
        providers = self.client.providers.list()
        for api_group in [Api.vector_io, Api.agents]:
            api_capitalized = name_map[api_group]
            report.append(f"\n## {api_capitalized}")
            test_table = [
                "| Provider | API | Capability | Test | Status |",
                "|:-----|:-----|:-----|:-----|:-----|",
            ]
            provider = [p for p in providers if p.api == str(api_group.name)]
            provider_str = ",".join(str(p) for p in provider) if provider else ""
            for api, capa_map in API_MAPS[api_group].items():
                for capa, tests in capa_map.items():
                    for test_name in tests:
                        test_nodeids = self.test_name_to_nodeid[test_name]
                        if not test_nodeids:
                            continue
                        test_table.append(
                            f"| {provider_str} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                        )
            report.extend(test_table)

        output_file = self.output_path
        text = "\n".join(report) + "\n"
        output_file.write_text(text)
        cprint(f"\nReport generated: {output_file.absolute()}", "green")

    def pytest_runtest_makereport(self, item, call):
        func_name = getattr(item, "originalname", item.name)
        self.test_name_to_nodeid[func_name].append(item.nodeid)

        # Get values from fixtures for report output
        if model_id := item.funcargs.get("text_model_id"):
            parts = model_id.split("/")
            text_model = parts[1] if len(parts) > 1 else model_id
            self.text_model_id = self.text_model_id or text_model
        elif model_id := item.funcargs.get("vision_model_id"):
            parts = model_id.split("/")
            vision_model = parts[1] if len(parts) > 1 else model_id
            self.vision_model_id = self.vision_model_id or vision_model

        if not self.client:
            self.client = item.funcargs.get("llama_stack_client")

    def _print_result_icon(self, result):
        if result == "Passed":
            return "✅"
        elif result == "Failed" or result == "Error":
            return "❌"
        else:
            # result == "Skipped":
            return "⏭️"

    def _process_outcome(self, report: CollectReport):
        if self._is_error(report):
            return "Error"
        if hasattr(report, "wasxfail"):
            if report.outcome in ["passed", "failed"]:
                return "XPassed"
            if report.outcome == "skipped":
                return "XFailed"

        return report.outcome.capitalize()

    def _is_error(self, report: CollectReport):
        return report.when in ["setup", "teardown", "collect"] and report.outcome == "failed"