refactor(test): move tools, evals, datasetio, scoring and post training tests (#1401)

All of the tests from `llama_stack/providers/tests/` are now moved to `tests/integration`. I converted the `tools`, `scoring` and `datasetio` tests to use API. However, `eval` and `post_training` proved to be a bit challenging to leaving those. I think `post_training` should be relatively straightforward also. As part of this, I noticed that `wolfram_alpha` tool wasn't added to some of our commonly used distros so I added it. I am going to remove a lot of code duplication from distros next so while this looks like a one-off right now, it will go away and be there uniformly for all distros.
2025-06-28 02:53:30 +00:00 · 2025-03-04 14:53:47 -08:00 · 2025-03-04 14:53:47 -08:00 · abfbaf3c1b
commit abfbaf3c1b
parent dd0db8038b
51 changed files with 471 additions and 1245 deletions
--- a/llama_stack/providers/tests/datasetio/init.py
+++ b/llama_stack/providers/tests/datasetio/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/tests/datasetio/conftest.py
+++ b/llama_stack/providers/tests/datasetio/conftest.py
@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-
-from .fixtures import DATASETIO_FIXTURES
-
-
-def pytest_configure(config):
-    for fixture_name in DATASETIO_FIXTURES:
-        config.addinivalue_line(
-            "markers",
-            f"{fixture_name}: marks tests as {fixture_name} specific",
-        )
-
-
-def pytest_generate_tests(metafunc):
-    if "datasetio_stack" in metafunc.fixturenames:
-        metafunc.parametrize(
-            "datasetio_stack",
-            [
-                pytest.param(fixture_name, marks=getattr(pytest.mark, fixture_name))
-                for fixture_name in DATASETIO_FIXTURES
-            ],
-            indirect=True,
-        )
--- a/llama_stack/providers/tests/datasetio/fixtures.py
+++ b/llama_stack/providers/tests/datasetio/fixtures.py
@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-import pytest_asyncio
-
-from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.tests.resolver import construct_stack_for_test
-
-from ..conftest import ProviderFixture, remote_stack_fixture
-
-
-@pytest.fixture(scope="session")
-def datasetio_remote() -> ProviderFixture:
-    return remote_stack_fixture()
-
-
-@pytest.fixture(scope="session")
-def datasetio_localfs() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="localfs",
-                provider_type="inline::localfs",
-                config={},
-            )
-        ],
-    )
-
-
-@pytest.fixture(scope="session")
-def datasetio_huggingface() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="huggingface",
-                provider_type="remote::huggingface",
-                config={},
-            )
-        ],
-    )
-
-
-DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"]
-
-
-@pytest_asyncio.fixture(scope="session")
-async def datasetio_stack(request):
-    fixture_name = request.param
-    fixture = request.getfixturevalue(f"datasetio_{fixture_name}")
-
-    test_stack = await construct_stack_for_test(
-        [Api.datasetio],
-        {"datasetio": fixture.providers},
-        fixture.provider_data,
-    )
-
-    return test_stack.impls[Api.datasetio], test_stack.impls[Api.datasets]
--- a/llama_stack/providers/tests/datasetio/test_dataset.csv
+++ b/llama_stack/providers/tests/datasetio/test_dataset.csv
@ -1,6 +0,0 @@
-input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
-What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
-What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
--- a/llama_stack/providers/tests/datasetio/test_datasetio.py
+++ b/llama_stack/providers/tests/datasetio/test_datasetio.py
@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import base64
-import mimetypes
-import os
-from pathlib import Path
-
-import pytest
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
-from llama_stack.apis.datasets import Datasets
-
-# How to run this test:
-#
-# pytest llama_stack/providers/tests/datasetio/test_datasetio.py
-#   -m "meta_reference"
-#   -v -s --tb=short --disable-warnings
-
-
-def data_url_from_file(file_path: str) -> str:
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    with open(file_path, "rb") as file:
-        file_content = file.read()
-
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
-
-
-async def register_dataset(
-    datasets_impl: Datasets,
-    for_generation=False,
-    for_rag=False,
-    dataset_id="test_dataset",
-):
-    if for_rag:
-        test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv"
-    else:
-        test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv"
-    test_url = data_url_from_file(str(test_file))
-
-    if for_generation:
-        dataset_schema = {
-            "expected_answer": StringType(),
-            "input_query": StringType(),
-            "chat_completion_input": ChatCompletionInputType(),
-        }
-    elif for_rag:
-        dataset_schema = {
-            "expected_answer": StringType(),
-            "input_query": StringType(),
-            "generated_answer": StringType(),
-            "context": StringType(),
-        }
-    else:
-        dataset_schema = {
-            "expected_answer": StringType(),
-            "input_query": StringType(),
-            "generated_answer": StringType(),
-        }
-
-    await datasets_impl.register_dataset(
-        dataset_id=dataset_id,
-        dataset_schema=dataset_schema,
-        url=URL(uri=test_url),
-    )
-
-
-class TestDatasetIO:
-    @pytest.mark.asyncio
-    async def test_datasets_list(self, datasetio_stack):
-        # NOTE: this needs you to ensure that you are starting from a clean state
-        # but so far we don't have an unregister API unfortunately, so be careful
-        _, datasets_impl = datasetio_stack
-        response = await datasets_impl.list_datasets()
-        assert isinstance(response, list)
-        assert len(response) == 0
-
-    @pytest.mark.asyncio
-    async def test_register_dataset(self, datasetio_stack):
-        _, datasets_impl = datasetio_stack
-        await register_dataset(datasets_impl)
-        response = await datasets_impl.list_datasets()
-        assert isinstance(response, list)
-        assert len(response) == 1
-        assert response[0].identifier == "test_dataset"
-
-        with pytest.raises(ValueError):
-            # unregister a dataset that does not exist
-            await datasets_impl.unregister_dataset("test_dataset2")
-
-        await datasets_impl.unregister_dataset("test_dataset")
-        response = await datasets_impl.list_datasets()
-        assert isinstance(response, list)
-        assert len(response) == 0
-
-        with pytest.raises(ValueError):
-            await datasets_impl.unregister_dataset("test_dataset")
-
-    @pytest.mark.asyncio
-    async def test_get_rows_paginated(self, datasetio_stack):
-        datasetio_impl, datasets_impl = datasetio_stack
-        await register_dataset(datasets_impl)
-        response = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset",
-            rows_in_page=3,
-        )
-        assert isinstance(response.rows, list)
-        assert len(response.rows) == 3
-        assert response.next_page_token == "3"
-
-        provider = datasetio_impl.routing_table.get_provider_impl("test_dataset")
-        if provider.__provider_spec__.provider_type == "remote":
-            pytest.skip("remote provider doesn't support get_rows_paginated")
-
-        # iterate over all rows
-        response = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset",
-            rows_in_page=2,
-            page_token=response.next_page_token,
-        )
-        assert isinstance(response.rows, list)
-        assert len(response.rows) == 2
-        assert response.next_page_token == "5"
--- a/llama_stack/providers/tests/datasetio/test_rag_dataset.csv
+++ b/llama_stack/providers/tests/datasetio/test_rag_dataset.csv
@ -1,6 +0,0 @@
-input_query,context,generated_answer,expected_answer
-What is the capital of France?,"France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum.",London,Paris
-Who is the CEO of Meta?,"Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies.",Mark Zuckerberg,Mark Zuckerberg
-What is the largest planet in our solar system?,"The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets.",Jupiter,Jupiter
-What is the smallest country in the world?,"Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy.",China,Vatican City
-What is the currency of Japan?,"Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871.",Yen,Yen
--- a/llama_stack/providers/tests/env.py
+++ b/llama_stack/providers/tests/env.py
@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-
-class MissingCredentialError(Exception):
-    pass
-
-
-def get_env_or_fail(key: str) -> str:
-    """Get environment variable or raise helpful error"""
-    value = os.getenv(key)
-    if not value:
-        raise MissingCredentialError(
-            f"\nMissing {key} in environment. Please set it using one of these methods:"
-            f"\n1. Export in shell: export {key}=your-key"
-            f"\n2. Create .env file in project root with: {key}=your-key"
-            f"\n3. Pass directly to pytest: pytest --env {key}=your-key"
-        )
-    return value
--- a/llama_stack/providers/tests/eval/init.py
+++ b/llama_stack/providers/tests/eval/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/tests/eval/conftest.py
+++ b/llama_stack/providers/tests/eval/conftest.py
@ -1,92 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-
-from ..agents.fixtures import AGENTS_FIXTURES
-from ..conftest import get_provider_fixture_overrides
-from ..datasetio.fixtures import DATASETIO_FIXTURES
-from ..inference.fixtures import INFERENCE_FIXTURES
-from ..safety.fixtures import SAFETY_FIXTURES
-from ..scoring.fixtures import SCORING_FIXTURES
-from ..tools.fixtures import TOOL_RUNTIME_FIXTURES
-from ..vector_io.fixtures import VECTOR_IO_FIXTURES
-from .fixtures import EVAL_FIXTURES
-
-DEFAULT_PROVIDER_COMBINATIONS = [
-    pytest.param(
-        {
-            "eval": "meta_reference",
-            "scoring": "basic",
-            "datasetio": "localfs",
-            "inference": "fireworks",
-            "agents": "meta_reference",
-            "safety": "llama_guard",
-            "vector_io": "faiss",
-            "tool_runtime": "memory_and_search",
-        },
-        id="meta_reference_eval_fireworks_inference",
-        marks=pytest.mark.meta_reference_eval_fireworks_inference,
-    ),
-    pytest.param(
-        {
-            "eval": "meta_reference",
-            "scoring": "basic",
-            "datasetio": "localfs",
-            "inference": "together",
-            "agents": "meta_reference",
-            "safety": "llama_guard",
-            "vector_io": "faiss",
-            "tool_runtime": "memory_and_search",
-        },
-        id="meta_reference_eval_together_inference",
-        marks=pytest.mark.meta_reference_eval_together_inference,
-    ),
-    pytest.param(
-        {
-            "eval": "meta_reference",
-            "scoring": "basic",
-            "datasetio": "huggingface",
-            "inference": "together",
-            "agents": "meta_reference",
-            "safety": "llama_guard",
-            "vector_io": "faiss",
-            "tool_runtime": "memory_and_search",
-        },
-        id="meta_reference_eval_together_inference_huggingface_datasetio",
-        marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio,
-    ),
-]
-
-
-def pytest_configure(config):
-    for fixture_name in [
-        "meta_reference_eval_fireworks_inference",
-        "meta_reference_eval_together_inference",
-        "meta_reference_eval_together_inference_huggingface_datasetio",
-    ]:
-        config.addinivalue_line(
-            "markers",
-            f"{fixture_name}: marks tests as {fixture_name} specific",
-        )
-
-
-def pytest_generate_tests(metafunc):
-    if "eval_stack" in metafunc.fixturenames:
-        available_fixtures = {
-            "eval": EVAL_FIXTURES,
-            "scoring": SCORING_FIXTURES,
-            "datasetio": DATASETIO_FIXTURES,
-            "inference": INFERENCE_FIXTURES,
-            "agents": AGENTS_FIXTURES,
-            "safety": SAFETY_FIXTURES,
-            "vector_io": VECTOR_IO_FIXTURES,
-            "tool_runtime": TOOL_RUNTIME_FIXTURES,
-        }
-        combinations = (
-            get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
-        )
-        metafunc.parametrize("eval_stack", combinations, indirect=True)
--- a/llama_stack/providers/tests/eval/constants.py
+++ b/llama_stack/providers/tests/eval/constants.py
@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-JUDGE_PROMPT = """
-You will be given a question, a expected_answer, and a system_answer.
-Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
-Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
-Provide your feedback as follows:
-Feedback:::
-Total rating: (your rating, as a int between 0 and 5)
-Now here are the question, expected_answer, system_answer.
-Question: {input_query}
-Expected Answer: {expected_answer}
-System Answer: {generated_answer}
-Feedback:::
-Total rating:
-"""
--- a/llama_stack/providers/tests/eval/fixtures.py
+++ b/llama_stack/providers/tests/eval/fixtures.py
@ -1,87 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-import pytest_asyncio
-
-from llama_stack.distribution.datatypes import Api, ModelInput, Provider
-from llama_stack.providers.tests.resolver import construct_stack_for_test
-
-from ..conftest import ProviderFixture, remote_stack_fixture
-
-
-@pytest.fixture(scope="session")
-def eval_remote() -> ProviderFixture:
-    return remote_stack_fixture()
-
-
-@pytest.fixture(scope="session")
-def eval_meta_reference() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="meta-reference",
-                provider_type="inline::meta-reference",
-                config={},
-            )
-        ],
-    )
-
-
-EVAL_FIXTURES = ["meta_reference", "remote"]
-
-
-@pytest_asyncio.fixture(scope="session")
-async def eval_stack(
-    request,
-    inference_model,
-    judge_model,
-    tool_group_input_memory,
-    tool_group_input_tavily_search,
-):
-    fixture_dict = request.param
-
-    providers = {}
-    provider_data = {}
-    for key in [
-        "datasetio",
-        "eval",
-        "scoring",
-        "inference",
-        "agents",
-        "safety",
-        "vector_io",
-        "tool_runtime",
-    ]:
-        fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
-        providers[key] = fixture.providers
-        if fixture.provider_data:
-            provider_data.update(fixture.provider_data)
-
-    test_stack = await construct_stack_for_test(
-        [
-            Api.eval,
-            Api.datasetio,
-            Api.inference,
-            Api.scoring,
-            Api.agents,
-            Api.safety,
-            Api.vector_io,
-            Api.tool_runtime,
-        ],
-        providers,
-        provider_data,
-        models=[
-            ModelInput(model_id=model)
-            for model in [
-                inference_model,
-                judge_model,
-            ]
-        ],
-        tool_groups=[tool_group_input_memory, tool_group_input_tavily_search],
-    )
-
-    return test_stack.impls
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@ -1,184 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import pytest
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
-from llama_stack.apis.eval.eval import (
-    AppBenchmarkConfig,
-    BenchmarkBenchmarkConfig,
-    ModelCandidate,
-)
-from llama_stack.apis.inference import SamplingParams
-from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
-from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
-
-from .constants import JUDGE_PROMPT
-
-# How to run this test:
-#
-# pytest llama_stack/providers/tests/eval/test_eval.py
-#   -m "meta_reference_eval_together_inference_huggingface_datasetio"
-#   -v -s --tb=short --disable-warnings
-
-
-class Testeval:
-    @pytest.mark.asyncio
-    async def test_benchmarks_list(self, eval_stack):
-        # NOTE: this needs you to ensure that you are starting from a clean state
-        # but so far we don't have an unregister API unfortunately, so be careful
-        benchmarks_impl = eval_stack[Api.benchmarks]
-        response = await benchmarks_impl.list_benchmarks()
-        assert isinstance(response, list)
-
-    @pytest.mark.asyncio
-    async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
-        eval_impl, benchmarks_impl, datasetio_impl, datasets_impl = (
-            eval_stack[Api.eval],
-            eval_stack[Api.benchmarks],
-            eval_stack[Api.datasetio],
-            eval_stack[Api.datasets],
-        )
-
-        await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval")
-        response = await datasets_impl.list_datasets()
-
-        rows = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset_for_eval",
-            rows_in_page=3,
-        )
-        assert len(rows.rows) == 3
-
-        scoring_functions = [
-            "basic::equality",
-        ]
-        benchmark_id = "meta-reference::app_eval"
-        await benchmarks_impl.register_benchmark(
-            benchmark_id=benchmark_id,
-            dataset_id="test_dataset_for_eval",
-            scoring_functions=scoring_functions,
-        )
-        response = await eval_impl.evaluate_rows(
-            benchmark_id=benchmark_id,
-            input_rows=rows.rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=AppBenchmarkConfig(
-                eval_candidate=ModelCandidate(
-                    model=inference_model,
-                    sampling_params=SamplingParams(),
-                ),
-                scoring_params={
-                    "meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams(
-                        judge_model=judge_model,
-                        prompt_template=JUDGE_PROMPT,
-                        judge_score_regexes=[
-                            r"Total rating: (\d+)",
-                            r"rating: (\d+)",
-                            r"Rating: (\d+)",
-                        ],
-                    )
-                },
-            ),
-        )
-        assert len(response.generations) == 3
-        assert "basic::equality" in response.scores
-
-    @pytest.mark.asyncio
-    async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
-        eval_impl, benchmarks_impl, datasets_impl = (
-            eval_stack[Api.eval],
-            eval_stack[Api.benchmarks],
-            eval_stack[Api.datasets],
-        )
-
-        await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval")
-
-        scoring_functions = [
-            "basic::subset_of",
-        ]
-
-        benchmark_id = "meta-reference::app_eval-2"
-        await benchmarks_impl.register_benchmark(
-            benchmark_id=benchmark_id,
-            dataset_id="test_dataset_for_eval",
-            scoring_functions=scoring_functions,
-        )
-        response = await eval_impl.run_eval(
-            benchmark_id=benchmark_id,
-            benchmark_config=AppBenchmarkConfig(
-                eval_candidate=ModelCandidate(
-                    model=inference_model,
-                    sampling_params=SamplingParams(),
-                ),
-            ),
-        )
-        assert response.job_id == "0"
-        job_status = await eval_impl.job_status(benchmark_id, response.job_id)
-        assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
-
-        assert eval_response is not None
-        assert len(eval_response.generations) == 5
-        assert "basic::subset_of" in eval_response.scores
-
-    @pytest.mark.asyncio
-    async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
-        eval_impl, benchmarks_impl, datasets_impl = (
-            eval_stack[Api.eval],
-            eval_stack[Api.benchmarks],
-            eval_stack[Api.datasets],
-        )
-
-        response = await datasets_impl.list_datasets()
-        assert len(response) > 0
-        if response[0].provider_id != "huggingface":
-            pytest.skip("Only huggingface provider supports pre-registered remote datasets")
-
-        await datasets_impl.register_dataset(
-            dataset_id="mmlu",
-            dataset_schema={
-                "input_query": StringType(),
-                "expected_answer": StringType(),
-                "chat_completion_input": ChatCompletionInputType(),
-            },
-            url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
-            metadata={
-                "path": "llamastack/evals",
-                "name": "evals__mmlu__details",
-                "split": "train",
-            },
-        )
-
-        # register eval task
-        await benchmarks_impl.register_benchmark(
-            benchmark_id="meta-reference-mmlu",
-            dataset_id="mmlu",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-        )
-
-        # list benchmarks
-        response = await benchmarks_impl.list_benchmarks()
-        assert len(response) > 0
-
-        benchmark_id = "meta-reference-mmlu"
-        response = await eval_impl.run_eval(
-            benchmark_id=benchmark_id,
-            benchmark_config=BenchmarkBenchmarkConfig(
-                eval_candidate=ModelCandidate(
-                    model=inference_model,
-                    sampling_params=SamplingParams(),
-                ),
-                num_examples=3,
-            ),
-        )
-        job_status = await eval_impl.job_status(benchmark_id, response.job_id)
-        assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
-        assert eval_response is not None
-        assert len(eval_response.generations) == 3
--- a/llama_stack/providers/tests/post_training/init.py
+++ b/llama_stack/providers/tests/post_training/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/tests/post_training/conftest.py
+++ b/llama_stack/providers/tests/post_training/conftest.py
@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-
-from ..conftest import get_provider_fixture_overrides
-from ..datasetio.fixtures import DATASETIO_FIXTURES
-from .fixtures import POST_TRAINING_FIXTURES
-
-DEFAULT_PROVIDER_COMBINATIONS = [
-    pytest.param(
-        {
-            "post_training": "torchtune",
-            "datasetio": "huggingface",
-        },
-        id="torchtune_post_training_huggingface_datasetio",
-        marks=pytest.mark.torchtune_post_training_huggingface_datasetio,
-    ),
-]
-
-
-def pytest_configure(config):
-    combined_fixtures = "torchtune_post_training_huggingface_datasetio"
-    config.addinivalue_line(
-        "markers",
-        f"{combined_fixtures}: marks tests as {combined_fixtures} specific",
-    )
-
-
-def pytest_generate_tests(metafunc):
-    if "post_training_stack" in metafunc.fixturenames:
-        available_fixtures = {
-            "eval": POST_TRAINING_FIXTURES,
-            "datasetio": DATASETIO_FIXTURES,
-        }
-        combinations = (
-            get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
-        )
-        metafunc.parametrize("post_training_stack", combinations, indirect=True)
--- a/llama_stack/providers/tests/post_training/fixtures.py
+++ b/llama_stack/providers/tests/post_training/fixtures.py
@ -1,72 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-import pytest_asyncio
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.type_system import StringType
-from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.models import ModelInput
-from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.tests.resolver import construct_stack_for_test
-
-from ..conftest import ProviderFixture
-
-
-@pytest.fixture(scope="session")
-def post_training_torchtune() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="torchtune",
-                provider_type="inline::torchtune",
-                config={},
-            )
-        ],
-    )
-
-
-POST_TRAINING_FIXTURES = ["torchtune"]
-
-
-@pytest_asyncio.fixture(scope="session")
-async def post_training_stack(request):
-    fixture_dict = request.param
-
-    providers = {}
-    provider_data = {}
-    for key in ["post_training", "datasetio"]:
-        fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
-        providers[key] = fixture.providers
-        if fixture.provider_data:
-            provider_data.update(fixture.provider_data)
-
-    test_stack = await construct_stack_for_test(
-        [Api.post_training, Api.datasetio],
-        providers,
-        provider_data,
-        models=[ModelInput(model_id="meta-llama/Llama-3.2-3B-Instruct")],
-        datasets=[
-            DatasetInput(
-                dataset_id="alpaca",
-                provider_id="huggingface",
-                url=URL(uri="https://huggingface.co/datasets/tatsu-lab/alpaca"),
-                metadata={
-                    "path": "tatsu-lab/alpaca",
-                    "split": "train",
-                },
-                dataset_schema={
-                    "instruction": StringType(),
-                    "input": StringType(),
-                    "output": StringType(),
-                    "text": StringType(),
-                },
-            ),
-        ],
-    )
-
-    return test_stack.impls[Api.post_training]
--- a/llama_stack/providers/tests/post_training/test_post_training.py
+++ b/llama_stack/providers/tests/post_training/test_post_training.py
@ -1,100 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import List
-
-import pytest
-
-from llama_stack.apis.common.job_types import JobStatus
-from llama_stack.apis.post_training import (
-    Checkpoint,
-    DataConfig,
-    LoraFinetuningConfig,
-    OptimizerConfig,
-    PostTrainingJob,
-    PostTrainingJobArtifactsResponse,
-    PostTrainingJobStatusResponse,
-    TrainingConfig,
-)
-
-# How to run this test:
-#
-# pytest llama_stack/providers/tests/post_training/test_post_training.py
-#   -m "torchtune_post_training_huggingface_datasetio"
-#   -v -s --tb=short --disable-warnings
-
-
-class TestPostTraining:
-    @pytest.mark.asyncio
-    async def test_supervised_fine_tune(self, post_training_stack):
-        algorithm_config = LoraFinetuningConfig(
-            type="LoRA",
-            lora_attn_modules=["q_proj", "v_proj", "output_proj"],
-            apply_lora_to_mlp=True,
-            apply_lora_to_output=False,
-            rank=8,
-            alpha=16,
-        )
-
-        data_config = DataConfig(
-            dataset_id="alpaca",
-            batch_size=1,
-            shuffle=False,
-        )
-
-        optimizer_config = OptimizerConfig(
-            optimizer_type="adamw",
-            lr=3e-4,
-            lr_min=3e-5,
-            weight_decay=0.1,
-            num_warmup_steps=100,
-        )
-
-        training_config = TrainingConfig(
-            n_epochs=1,
-            data_config=data_config,
-            optimizer_config=optimizer_config,
-            max_steps_per_epoch=1,
-            gradient_accumulation_steps=1,
-        )
-        post_training_impl = post_training_stack
-        response = await post_training_impl.supervised_fine_tune(
-            job_uuid="1234",
-            model="Llama3.2-3B-Instruct",
-            algorithm_config=algorithm_config,
-            training_config=training_config,
-            hyperparam_search_config={},
-            logger_config={},
-            checkpoint_dir="null",
-        )
-        assert isinstance(response, PostTrainingJob)
-        assert response.job_uuid == "1234"
-
-    @pytest.mark.asyncio
-    async def test_get_training_jobs(self, post_training_stack):
-        post_training_impl = post_training_stack
-        jobs_list = await post_training_impl.get_training_jobs()
-        assert isinstance(jobs_list, List)
-        assert jobs_list[0].job_uuid == "1234"
-
-    @pytest.mark.asyncio
-    async def test_get_training_job_status(self, post_training_stack):
-        post_training_impl = post_training_stack
-        job_status = await post_training_impl.get_training_job_status("1234")
-        assert isinstance(job_status, PostTrainingJobStatusResponse)
-        assert job_status.job_uuid == "1234"
-        assert job_status.status == JobStatus.completed
-        assert isinstance(job_status.checkpoints[0], Checkpoint)
-
-    @pytest.mark.asyncio
-    async def test_get_training_job_artifacts(self, post_training_stack):
-        post_training_impl = post_training_stack
-        job_artifacts = await post_training_impl.get_training_job_artifacts("1234")
-        assert isinstance(job_artifacts, PostTrainingJobArtifactsResponse)
-        assert job_artifacts.job_uuid == "1234"
-        assert isinstance(job_artifacts.checkpoints[0], Checkpoint)
-        assert job_artifacts.checkpoints[0].identifier == "Llama3.2-3B-Instruct-sft-0"
-        assert job_artifacts.checkpoints[0].epoch == 0
-        assert "/.llama/checkpoints/Llama3.2-3B-Instruct-sft-0" in job_artifacts.checkpoints[0].path
--- a/llama_stack/providers/tests/scoring/init.py
+++ b/llama_stack/providers/tests/scoring/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/tests/scoring/conftest.py
+++ b/llama_stack/providers/tests/scoring/conftest.py
@ -1,75 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-
-from ..conftest import get_provider_fixture_overrides
-from ..datasetio.fixtures import DATASETIO_FIXTURES
-from ..inference.fixtures import INFERENCE_FIXTURES
-from .fixtures import SCORING_FIXTURES
-
-DEFAULT_PROVIDER_COMBINATIONS = [
-    pytest.param(
-        {
-            "scoring": "basic",
-            "datasetio": "localfs",
-            "inference": "together",
-        },
-        id="basic_scoring_together_inference",
-        marks=pytest.mark.basic_scoring_together_inference,
-    ),
-    pytest.param(
-        {
-            "scoring": "braintrust",
-            "datasetio": "localfs",
-            "inference": "together",
-        },
-        id="braintrust_scoring_together_inference",
-        marks=pytest.mark.braintrust_scoring_together_inference,
-    ),
-    pytest.param(
-        {
-            "scoring": "llm_as_judge",
-            "datasetio": "localfs",
-            "inference": "together",
-        },
-        id="llm_as_judge_scoring_together_inference",
-        marks=pytest.mark.llm_as_judge_scoring_together_inference,
-    ),
-]
-
-
-def pytest_configure(config):
-    for fixture_name in [
-        "basic_scoring_together_inference",
-        "braintrust_scoring_together_inference",
-        "llm_as_judge_scoring_together_inference",
-    ]:
-        config.addinivalue_line(
-            "markers",
-            f"{fixture_name}: marks tests as {fixture_name} specific",
-        )
-
-
-def pytest_generate_tests(metafunc):
-    judge_model = metafunc.config.getoption("--judge-model")
-    if "judge_model" in metafunc.fixturenames:
-        metafunc.parametrize(
-            "judge_model",
-            [pytest.param(judge_model, id="")],
-            indirect=True,
-        )
-
-    if "scoring_stack" in metafunc.fixturenames:
-        available_fixtures = {
-            "scoring": SCORING_FIXTURES,
-            "datasetio": DATASETIO_FIXTURES,
-            "inference": INFERENCE_FIXTURES,
-        }
-        combinations = (
-            get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
-        )
-        metafunc.parametrize("scoring_stack", combinations, indirect=True)
--- a/llama_stack/providers/tests/scoring/fixtures.py
+++ b/llama_stack/providers/tests/scoring/fixtures.py
@ -1,100 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-import pytest_asyncio
-
-from llama_stack.apis.models import ModelInput
-from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.scoring.braintrust import BraintrustScoringConfig
-from llama_stack.providers.tests.resolver import construct_stack_for_test
-
-from ..conftest import ProviderFixture, remote_stack_fixture
-from ..env import get_env_or_fail
-
-
-@pytest.fixture(scope="session")
-def scoring_remote() -> ProviderFixture:
-    return remote_stack_fixture()
-
-
-@pytest.fixture(scope="session")
-def judge_model(request):
-    if hasattr(request, "param"):
-        return request.param
-    return request.config.getoption("--judge-model", None)
-
-
-@pytest.fixture(scope="session")
-def scoring_basic() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="basic",
-                provider_type="inline::basic",
-                config={},
-            )
-        ],
-    )
-
-
-@pytest.fixture(scope="session")
-def scoring_braintrust() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="braintrust",
-                provider_type="inline::braintrust",
-                config=BraintrustScoringConfig(
-                    openai_api_key=get_env_or_fail("OPENAI_API_KEY"),
-                ).model_dump(),
-            )
-        ],
-    )
-
-
-@pytest.fixture(scope="session")
-def scoring_llm_as_judge() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="llm-as-judge",
-                provider_type="inline::llm-as-judge",
-                config={},
-            )
-        ],
-    )
-
-
-SCORING_FIXTURES = ["basic", "remote", "braintrust", "llm_as_judge"]
-
-
-@pytest_asyncio.fixture(scope="session")
-async def scoring_stack(request, inference_model, judge_model):
-    fixture_dict = request.param
-
-    providers = {}
-    provider_data = {}
-    for key in ["datasetio", "scoring", "inference"]:
-        fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
-        providers[key] = fixture.providers
-        if fixture.provider_data:
-            provider_data.update(fixture.provider_data)
-
-    test_stack = await construct_stack_for_test(
-        [Api.scoring, Api.datasetio, Api.inference],
-        providers,
-        provider_data,
-        models=[
-            ModelInput(model_id=model)
-            for model in [
-                inference_model,
-                judge_model,
-            ]
-        ],
-    )
-
-    return test_stack.impls
--- a/llama_stack/providers/tests/scoring/test_scoring.py
+++ b/llama_stack/providers/tests/scoring/test_scoring.py
@ -1,213 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import pytest
-
-from llama_stack.apis.scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-)
-from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
-
-# How to run this test:
-#
-# pytest llama_stack/providers/tests/scoring/test_scoring.py
-#   -m "meta_reference"
-#   -v -s --tb=short --disable-warnings
-
-
-@pytest.fixture
-def sample_judge_prompt_template():
-    return "Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9."
-
-
-class TestScoring:
-    @pytest.mark.asyncio
-    async def test_scoring_functions_list(self, scoring_stack):
-        # NOTE: this needs you to ensure that you are starting from a clean state
-        # but so far we don't have an unregister API unfortunately, so be careful
-        scoring_functions_impl = scoring_stack[Api.scoring_functions]
-        response = await scoring_functions_impl.list_scoring_functions()
-        assert isinstance(response, list)
-        assert len(response) > 0
-
-    @pytest.mark.asyncio
-    async def test_scoring_score(self, scoring_stack):
-        (
-            scoring_impl,
-            scoring_functions_impl,
-            datasetio_impl,
-            datasets_impl,
-        ) = (
-            scoring_stack[Api.scoring],
-            scoring_stack[Api.scoring_functions],
-            scoring_stack[Api.datasetio],
-            scoring_stack[Api.datasets],
-        )
-        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
-        provider_id = scoring_fns_list[0].provider_id
-        if provider_id == "llm-as-judge":
-            pytest.skip(f"{provider_id} provider does not support scoring without params")
-
-        await register_dataset(datasets_impl, for_rag=True)
-        response = await datasets_impl.list_datasets()
-        assert len(response) == 1
-
-        # scoring individual rows
-        rows = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset",
-            rows_in_page=3,
-        )
-        assert len(rows.rows) == 3
-
-        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
-        scoring_functions = {
-            scoring_fns_list[0].identifier: None,
-        }
-
-        response = await scoring_impl.score(
-            input_rows=rows.rows,
-            scoring_functions=scoring_functions,
-        )
-        assert len(response.results) == len(scoring_functions)
-        for x in scoring_functions:
-            assert x in response.results
-            assert len(response.results[x].score_rows) == len(rows.rows)
-
-        # score batch
-        response = await scoring_impl.score_batch(
-            dataset_id="test_dataset",
-            scoring_functions=scoring_functions,
-        )
-        assert len(response.results) == len(scoring_functions)
-        for x in scoring_functions:
-            assert x in response.results
-            assert len(response.results[x].score_rows) == 5
-
-    @pytest.mark.asyncio
-    async def test_scoring_score_with_params_llm_as_judge(
-        self, scoring_stack, sample_judge_prompt_template, judge_model
-    ):
-        (
-            scoring_impl,
-            scoring_functions_impl,
-            datasetio_impl,
-            datasets_impl,
-        ) = (
-            scoring_stack[Api.scoring],
-            scoring_stack[Api.scoring_functions],
-            scoring_stack[Api.datasetio],
-            scoring_stack[Api.datasets],
-        )
-        await register_dataset(datasets_impl, for_rag=True)
-        response = await datasets_impl.list_datasets()
-        assert len(response) == 1
-
-        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
-        provider_id = scoring_fns_list[0].provider_id
-        if provider_id == "braintrust" or provider_id == "basic":
-            pytest.skip(f"{provider_id} provider does not support scoring with params")
-
-        # scoring individual rows
-        rows = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset",
-            rows_in_page=3,
-        )
-        assert len(rows.rows) == 3
-
-        scoring_functions = {
-            "llm-as-judge::base": LLMAsJudgeScoringFnParams(
-                judge_model=judge_model,
-                prompt_template=sample_judge_prompt_template,
-                judge_score_regexes=[r"Score: (\d+)"],
-                aggregation_functions=[AggregationFunctionType.categorical_count],
-            )
-        }
-
-        response = await scoring_impl.score(
-            input_rows=rows.rows,
-            scoring_functions=scoring_functions,
-        )
-        assert len(response.results) == len(scoring_functions)
-        for x in scoring_functions:
-            assert x in response.results
-            assert len(response.results[x].score_rows) == len(rows.rows)
-
-        # score batch
-        response = await scoring_impl.score_batch(
-            dataset_id="test_dataset",
-            scoring_functions=scoring_functions,
-        )
-        assert len(response.results) == len(scoring_functions)
-        for x in scoring_functions:
-            assert x in response.results
-            assert len(response.results[x].score_rows) == 5
-
-    @pytest.mark.asyncio
-    async def test_scoring_score_with_aggregation_functions(
-        self, scoring_stack, sample_judge_prompt_template, judge_model
-    ):
-        (
-            scoring_impl,
-            scoring_functions_impl,
-            datasetio_impl,
-            datasets_impl,
-        ) = (
-            scoring_stack[Api.scoring],
-            scoring_stack[Api.scoring_functions],
-            scoring_stack[Api.datasetio],
-            scoring_stack[Api.datasets],
-        )
-        await register_dataset(datasets_impl, for_rag=True)
-        rows = await datasetio_impl.get_rows_paginated(
-            dataset_id="test_dataset",
-            rows_in_page=3,
-        )
-        assert len(rows.rows) == 3
-
-        scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
-        scoring_functions = {}
-        aggr_fns = [
-            AggregationFunctionType.accuracy,
-            AggregationFunctionType.median,
-            AggregationFunctionType.categorical_count,
-            AggregationFunctionType.average,
-        ]
-        for x in scoring_fns_list:
-            if x.provider_id == "llm-as-judge":
-                aggr_fns = [AggregationFunctionType.categorical_count]
-                scoring_functions[x.identifier] = LLMAsJudgeScoringFnParams(
-                    judge_model=judge_model,
-                    prompt_template=sample_judge_prompt_template,
-                    judge_score_regexes=[r"Score: (\d+)"],
-                    aggregation_functions=aggr_fns,
-                )
-            elif x.provider_id == "basic" or x.provider_id == "braintrust":
-                if "regex_parser" in x.identifier:
-                    scoring_functions[x.identifier] = RegexParserScoringFnParams(
-                        aggregation_functions=aggr_fns,
-                    )
-                else:
-                    scoring_functions[x.identifier] = BasicScoringFnParams(
-                        aggregation_functions=aggr_fns,
-                    )
-            else:
-                scoring_functions[x.identifier] = None
-
-        response = await scoring_impl.score(
-            input_rows=rows.rows,
-            scoring_functions=scoring_functions,
-        )
-
-        assert len(response.results) == len(scoring_functions)
-        for x in scoring_functions:
-            assert x in response.results
-            assert len(response.results[x].score_rows) == len(rows.rows)
-            assert len(response.results[x].aggregated_results) == len(aggr_fns)
--- a/llama_stack/providers/tests/tools/init.py
+++ b/llama_stack/providers/tests/tools/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/tests/tools/conftest.py
+++ b/llama_stack/providers/tests/tools/conftest.py
@ -1,48 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-
-from ..conftest import get_provider_fixture_overrides
-from ..inference.fixtures import INFERENCE_FIXTURES
-from ..safety.fixtures import SAFETY_FIXTURES
-from ..vector_io.fixtures import VECTOR_IO_FIXTURES
-from .fixtures import TOOL_RUNTIME_FIXTURES
-
-DEFAULT_PROVIDER_COMBINATIONS = [
-    pytest.param(
-        {
-            "inference": "together",
-            "safety": "llama_guard",
-            "vector_io": "faiss",
-            "tool_runtime": "memory_and_search",
-        },
-        id="together",
-        marks=pytest.mark.together,
-    ),
-]
-
-
-def pytest_configure(config):
-    for mark in ["together"]:
-        config.addinivalue_line(
-            "markers",
-            f"{mark}: marks tests as {mark} specific",
-        )
-
-
-def pytest_generate_tests(metafunc):
-    if "tools_stack" in metafunc.fixturenames:
-        available_fixtures = {
-            "inference": INFERENCE_FIXTURES,
-            "safety": SAFETY_FIXTURES,
-            "vector_io": VECTOR_IO_FIXTURES,
-            "tool_runtime": TOOL_RUNTIME_FIXTURES,
-        }
-        combinations = (
-            get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
-        )
-        metafunc.parametrize("tools_stack", combinations, indirect=True)
--- a/llama_stack/providers/tests/tools/fixtures.py
+++ b/llama_stack/providers/tests/tools/fixtures.py
@ -1,133 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-import pytest
-import pytest_asyncio
-
-from llama_stack.apis.models import ModelInput, ModelType
-from llama_stack.apis.tools import ToolGroupInput
-from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.tests.resolver import construct_stack_for_test
-
-from ..conftest import ProviderFixture
-
-
-@pytest.fixture(scope="session")
-def tool_runtime_memory_and_search() -> ProviderFixture:
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="rag-runtime",
-                provider_type="inline::rag-runtime",
-                config={},
-            ),
-            Provider(
-                provider_id="tavily-search",
-                provider_type="remote::tavily-search",
-                config={
-                    "api_key": os.environ["TAVILY_SEARCH_API_KEY"],
-                },
-            ),
-            Provider(
-                provider_id="wolfram-alpha",
-                provider_type="remote::wolfram-alpha",
-                config={
-                    "api_key": os.environ["WOLFRAM_ALPHA_API_KEY"],
-                },
-            ),
-        ],
-    )
-
-
-@pytest.fixture(scope="session")
-def tool_group_input_memory() -> ToolGroupInput:
-    return ToolGroupInput(
-        toolgroup_id="builtin::rag",
-        provider_id="rag-runtime",
-    )
-
-
-@pytest.fixture(scope="session")
-def tool_group_input_tavily_search() -> ToolGroupInput:
-    return ToolGroupInput(
-        toolgroup_id="builtin::web_search",
-        provider_id="tavily-search",
-    )
-
-
-@pytest.fixture(scope="session")
-def tool_group_input_wolfram_alpha() -> ToolGroupInput:
-    return ToolGroupInput(
-        toolgroup_id="builtin::wolfram_alpha",
-        provider_id="wolfram-alpha",
-    )
-
-
-TOOL_RUNTIME_FIXTURES = ["memory_and_search"]
-
-
-@pytest_asyncio.fixture(scope="session")
-async def tools_stack(
-    request,
-    inference_model,
-    tool_group_input_memory,
-    tool_group_input_tavily_search,
-    tool_group_input_wolfram_alpha,
-):
-    fixture_dict = request.param
-
-    providers = {}
-    provider_data = {}
-    for key in ["inference", "vector_io", "tool_runtime"]:
-        fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
-        providers[key] = fixture.providers
-        if key == "inference":
-            providers[key].append(
-                Provider(
-                    provider_id="tools_memory_provider",
-                    provider_type="inline::sentence-transformers",
-                    config={},
-                )
-            )
-        if fixture.provider_data:
-            provider_data.update(fixture.provider_data)
-    inference_models = inference_model if isinstance(inference_model, list) else [inference_model]
-    models = [
-        ModelInput(
-            model_id=model,
-            model_type=ModelType.llm,
-            provider_id=providers["inference"][0].provider_id,
-        )
-        for model in inference_models
-    ]
-    models.append(
-        ModelInput(
-            model_id="all-MiniLM-L6-v2",
-            model_type=ModelType.embedding,
-            provider_id="tools_memory_provider",
-            metadata={"embedding_dimension": 384},
-        )
-    )
-
-    test_stack = await construct_stack_for_test(
-        [
-            Api.tool_groups,
-            Api.inference,
-            Api.vector_io,
-            Api.tool_runtime,
-        ],
-        providers,
-        provider_data,
-        models=models,
-        tool_groups=[
-            tool_group_input_tavily_search,
-            tool_group_input_wolfram_alpha,
-            tool_group_input_memory,
-        ],
-    )
-    return test_stack
--- a/llama_stack/providers/tests/tools/test_tools.py
+++ b/llama_stack/providers/tests/tools/test_tools.py
@ -1,109 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-import pytest
-
-from llama_stack.apis.tools import RAGDocument, RAGQueryResult, ToolInvocationResult
-from llama_stack.providers.datatypes import Api
-
-
-@pytest.fixture
-def sample_search_query():
-    return "What are the latest developments in quantum computing?"
-
-
-@pytest.fixture
-def sample_wolfram_alpha_query():
-    return "What is the square root of 16?"
-
-
-@pytest.fixture
-def sample_documents():
-    urls = [
-        "memory_optimizations.rst",
-        "chat.rst",
-        "llama3.rst",
-        "qat_finetune.rst",
-        "lora_finetune.rst",
-    ]
-    return [
-        RAGDocument(
-            document_id=f"num-{i}",
-            content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
-            mime_type="text/plain",
-            metadata={},
-        )
-        for i, url in enumerate(urls)
-    ]
-
-
-class TestTools:
-    @pytest.mark.asyncio
-    async def test_web_search_tool(self, tools_stack, sample_search_query):
-        """Test the web search tool functionality."""
-        if "TAVILY_SEARCH_API_KEY" not in os.environ:
-            pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test")
-
-        tools_impl = tools_stack.impls[Api.tool_runtime]
-
-        # Execute the tool
-        response = await tools_impl.invoke_tool(tool_name="web_search", kwargs={"query": sample_search_query})
-
-        # Verify the response
-        assert isinstance(response, ToolInvocationResult)
-        assert response.content is not None
-        assert len(response.content) > 0
-        assert isinstance(response.content, str)
-
-    @pytest.mark.asyncio
-    async def test_wolfram_alpha_tool(self, tools_stack, sample_wolfram_alpha_query):
-        """Test the wolfram alpha tool functionality."""
-        if "WOLFRAM_ALPHA_API_KEY" not in os.environ:
-            pytest.skip("WOLFRAM_ALPHA_API_KEY not set, skipping test")
-
-        tools_impl = tools_stack.impls[Api.tool_runtime]
-
-        response = await tools_impl.invoke_tool(tool_name="wolfram_alpha", kwargs={"query": sample_wolfram_alpha_query})
-
-        # Verify the response
-        assert isinstance(response, ToolInvocationResult)
-        assert response.content is not None
-        assert len(response.content) > 0
-        assert isinstance(response.content, str)
-
-    @pytest.mark.asyncio
-    async def test_rag_tool(self, tools_stack, sample_documents):
-        """Test the memory tool functionality."""
-        vector_dbs_impl = tools_stack.impls[Api.vector_dbs]
-        tools_impl = tools_stack.impls[Api.tool_runtime]
-
-        # Register memory bank
-        await vector_dbs_impl.register_vector_db(
-            vector_db_id="test_bank",
-            embedding_model="all-MiniLM-L6-v2",
-            embedding_dimension=384,
-            provider_id="faiss",
-        )
-
-        # Insert documents into memory
-        await tools_impl.rag_tool.insert(
-            documents=sample_documents,
-            vector_db_id="test_bank",
-            chunk_size_in_tokens=512,
-        )
-
-        # Execute the memory tool
-        response = await tools_impl.rag_tool.query(
-            content="What are the main topics covered in the documentation?",
-            vector_db_ids=["test_bank"],
-        )
-
-        # Verify the response
-        assert isinstance(response, RAGQueryResult)
-        assert response.content is not None
-        assert len(response.content) > 0