refactor(test): move tools, evals, datasetio, scoring and post training tests (#1401)

All of the tests from `llama_stack/providers/tests/` are now moved to
`tests/integration`.

I converted the `tools`, `scoring` and `datasetio` tests to use API.
However, `eval` and `post_training` proved to be a bit challenging to
leaving those. I think `post_training` should be relatively
straightforward also.

As part of this, I noticed that `wolfram_alpha` tool wasn't added to
some of our commonly used distros so I added it. I am going to remove a
lot of code duplication from distros next so while this looks like a
one-off right now, it will go away and be there uniformly for all
distros.
This commit is contained in:
Ashwin Bharambe 2025-03-04 14:53:47 -08:00 committed by GitHub
parent dd0db8038b
commit abfbaf3c1b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
51 changed files with 471 additions and 1245 deletions

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,29 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from .fixtures import DATASETIO_FIXTURES
def pytest_configure(config):
for fixture_name in DATASETIO_FIXTURES:
config.addinivalue_line(
"markers",
f"{fixture_name}: marks tests as {fixture_name} specific",
)
def pytest_generate_tests(metafunc):
if "datasetio_stack" in metafunc.fixturenames:
metafunc.parametrize(
"datasetio_stack",
[
pytest.param(fixture_name, marks=getattr(pytest.mark, fixture_name))
for fixture_name in DATASETIO_FIXTURES
],
indirect=True,
)

View file

@ -1,61 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
import pytest_asyncio
from llama_stack.distribution.datatypes import Api, Provider
from llama_stack.providers.tests.resolver import construct_stack_for_test
from ..conftest import ProviderFixture, remote_stack_fixture
@pytest.fixture(scope="session")
def datasetio_remote() -> ProviderFixture:
return remote_stack_fixture()
@pytest.fixture(scope="session")
def datasetio_localfs() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="localfs",
provider_type="inline::localfs",
config={},
)
],
)
@pytest.fixture(scope="session")
def datasetio_huggingface() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="huggingface",
provider_type="remote::huggingface",
config={},
)
],
)
DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"]
@pytest_asyncio.fixture(scope="session")
async def datasetio_stack(request):
fixture_name = request.param
fixture = request.getfixturevalue(f"datasetio_{fixture_name}")
test_stack = await construct_stack_for_test(
[Api.datasetio],
{"datasetio": fixture.providers},
fixture.provider_data,
)
return test_stack.impls[Api.datasetio], test_stack.impls[Api.datasets]

View file

@ -1,6 +0,0 @@
input_query,generated_answer,expected_answer,chat_completion_input
What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
1 input_query generated_answer expected_answer chat_completion_input
2 What is the capital of France? London Paris [{'role': 'user', 'content': 'What is the capital of France?'}]
3 Who is the CEO of Meta? Mark Zuckerberg Mark Zuckerberg [{'role': 'user', 'content': 'Who is the CEO of Meta?'}]
4 What is the largest planet in our solar system? Jupiter Jupiter [{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]
5 What is the smallest country in the world? China Vatican City [{'role': 'user', 'content': 'What is the smallest country in the world?'}]
6 What is the currency of Japan? Yen Yen [{'role': 'user', 'content': 'What is the currency of Japan?'}]

View file

@ -1,134 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import mimetypes
import os
from pathlib import Path
import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
from llama_stack.apis.datasets import Datasets
# How to run this test:
#
# pytest llama_stack/providers/tests/datasetio/test_datasetio.py
# -m "meta_reference"
# -v -s --tb=short --disable-warnings
def data_url_from_file(file_path: str) -> str:
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
with open(file_path, "rb") as file:
file_content = file.read()
base64_content = base64.b64encode(file_content).decode("utf-8")
mime_type, _ = mimetypes.guess_type(file_path)
data_url = f"data:{mime_type};base64,{base64_content}"
return data_url
async def register_dataset(
datasets_impl: Datasets,
for_generation=False,
for_rag=False,
dataset_id="test_dataset",
):
if for_rag:
test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv"
else:
test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv"
test_url = data_url_from_file(str(test_file))
if for_generation:
dataset_schema = {
"expected_answer": StringType(),
"input_query": StringType(),
"chat_completion_input": ChatCompletionInputType(),
}
elif for_rag:
dataset_schema = {
"expected_answer": StringType(),
"input_query": StringType(),
"generated_answer": StringType(),
"context": StringType(),
}
else:
dataset_schema = {
"expected_answer": StringType(),
"input_query": StringType(),
"generated_answer": StringType(),
}
await datasets_impl.register_dataset(
dataset_id=dataset_id,
dataset_schema=dataset_schema,
url=URL(uri=test_url),
)
class TestDatasetIO:
@pytest.mark.asyncio
async def test_datasets_list(self, datasetio_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
_, datasets_impl = datasetio_stack
response = await datasets_impl.list_datasets()
assert isinstance(response, list)
assert len(response) == 0
@pytest.mark.asyncio
async def test_register_dataset(self, datasetio_stack):
_, datasets_impl = datasetio_stack
await register_dataset(datasets_impl)
response = await datasets_impl.list_datasets()
assert isinstance(response, list)
assert len(response) == 1
assert response[0].identifier == "test_dataset"
with pytest.raises(ValueError):
# unregister a dataset that does not exist
await datasets_impl.unregister_dataset("test_dataset2")
await datasets_impl.unregister_dataset("test_dataset")
response = await datasets_impl.list_datasets()
assert isinstance(response, list)
assert len(response) == 0
with pytest.raises(ValueError):
await datasets_impl.unregister_dataset("test_dataset")
@pytest.mark.asyncio
async def test_get_rows_paginated(self, datasetio_stack):
datasetio_impl, datasets_impl = datasetio_stack
await register_dataset(datasets_impl)
response = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset",
rows_in_page=3,
)
assert isinstance(response.rows, list)
assert len(response.rows) == 3
assert response.next_page_token == "3"
provider = datasetio_impl.routing_table.get_provider_impl("test_dataset")
if provider.__provider_spec__.provider_type == "remote":
pytest.skip("remote provider doesn't support get_rows_paginated")
# iterate over all rows
response = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset",
rows_in_page=2,
page_token=response.next_page_token,
)
assert isinstance(response.rows, list)
assert len(response.rows) == 2
assert response.next_page_token == "5"

View file

@ -1,6 +0,0 @@
input_query,context,generated_answer,expected_answer
What is the capital of France?,"France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum.",London,Paris
Who is the CEO of Meta?,"Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies.",Mark Zuckerberg,Mark Zuckerberg
What is the largest planet in our solar system?,"The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets.",Jupiter,Jupiter
What is the smallest country in the world?,"Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy.",China,Vatican City
What is the currency of Japan?,"Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871.",Yen,Yen
1 input_query context generated_answer expected_answer
2 What is the capital of France? France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum. London Paris
3 Who is the CEO of Meta? Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies. Mark Zuckerberg Mark Zuckerberg
4 What is the largest planet in our solar system? The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets. Jupiter Jupiter
5 What is the smallest country in the world? Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy. China Vatican City
6 What is the currency of Japan? Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871. Yen Yen

View file

@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
class MissingCredentialError(Exception):
pass
def get_env_or_fail(key: str) -> str:
"""Get environment variable or raise helpful error"""
value = os.getenv(key)
if not value:
raise MissingCredentialError(
f"\nMissing {key} in environment. Please set it using one of these methods:"
f"\n1. Export in shell: export {key}=your-key"
f"\n2. Create .env file in project root with: {key}=your-key"
f"\n3. Pass directly to pytest: pytest --env {key}=your-key"
)
return value

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,92 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..agents.fixtures import AGENTS_FIXTURES
from ..conftest import get_provider_fixture_overrides
from ..datasetio.fixtures import DATASETIO_FIXTURES
from ..inference.fixtures import INFERENCE_FIXTURES
from ..safety.fixtures import SAFETY_FIXTURES
from ..scoring.fixtures import SCORING_FIXTURES
from ..tools.fixtures import TOOL_RUNTIME_FIXTURES
from ..vector_io.fixtures import VECTOR_IO_FIXTURES
from .fixtures import EVAL_FIXTURES
DEFAULT_PROVIDER_COMBINATIONS = [
pytest.param(
{
"eval": "meta_reference",
"scoring": "basic",
"datasetio": "localfs",
"inference": "fireworks",
"agents": "meta_reference",
"safety": "llama_guard",
"vector_io": "faiss",
"tool_runtime": "memory_and_search",
},
id="meta_reference_eval_fireworks_inference",
marks=pytest.mark.meta_reference_eval_fireworks_inference,
),
pytest.param(
{
"eval": "meta_reference",
"scoring": "basic",
"datasetio": "localfs",
"inference": "together",
"agents": "meta_reference",
"safety": "llama_guard",
"vector_io": "faiss",
"tool_runtime": "memory_and_search",
},
id="meta_reference_eval_together_inference",
marks=pytest.mark.meta_reference_eval_together_inference,
),
pytest.param(
{
"eval": "meta_reference",
"scoring": "basic",
"datasetio": "huggingface",
"inference": "together",
"agents": "meta_reference",
"safety": "llama_guard",
"vector_io": "faiss",
"tool_runtime": "memory_and_search",
},
id="meta_reference_eval_together_inference_huggingface_datasetio",
marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio,
),
]
def pytest_configure(config):
for fixture_name in [
"meta_reference_eval_fireworks_inference",
"meta_reference_eval_together_inference",
"meta_reference_eval_together_inference_huggingface_datasetio",
]:
config.addinivalue_line(
"markers",
f"{fixture_name}: marks tests as {fixture_name} specific",
)
def pytest_generate_tests(metafunc):
if "eval_stack" in metafunc.fixturenames:
available_fixtures = {
"eval": EVAL_FIXTURES,
"scoring": SCORING_FIXTURES,
"datasetio": DATASETIO_FIXTURES,
"inference": INFERENCE_FIXTURES,
"agents": AGENTS_FIXTURES,
"safety": SAFETY_FIXTURES,
"vector_io": VECTOR_IO_FIXTURES,
"tool_runtime": TOOL_RUNTIME_FIXTURES,
}
combinations = (
get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
)
metafunc.parametrize("eval_stack", combinations, indirect=True)

View file

@ -1,20 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
JUDGE_PROMPT = """
You will be given a question, a expected_answer, and a system_answer.
Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
Provide your feedback as follows:
Feedback:::
Total rating: (your rating, as a int between 0 and 5)
Now here are the question, expected_answer, system_answer.
Question: {input_query}
Expected Answer: {expected_answer}
System Answer: {generated_answer}
Feedback:::
Total rating:
"""

View file

@ -1,87 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
import pytest_asyncio
from llama_stack.distribution.datatypes import Api, ModelInput, Provider
from llama_stack.providers.tests.resolver import construct_stack_for_test
from ..conftest import ProviderFixture, remote_stack_fixture
@pytest.fixture(scope="session")
def eval_remote() -> ProviderFixture:
return remote_stack_fixture()
@pytest.fixture(scope="session")
def eval_meta_reference() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="meta-reference",
provider_type="inline::meta-reference",
config={},
)
],
)
EVAL_FIXTURES = ["meta_reference", "remote"]
@pytest_asyncio.fixture(scope="session")
async def eval_stack(
request,
inference_model,
judge_model,
tool_group_input_memory,
tool_group_input_tavily_search,
):
fixture_dict = request.param
providers = {}
provider_data = {}
for key in [
"datasetio",
"eval",
"scoring",
"inference",
"agents",
"safety",
"vector_io",
"tool_runtime",
]:
fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
providers[key] = fixture.providers
if fixture.provider_data:
provider_data.update(fixture.provider_data)
test_stack = await construct_stack_for_test(
[
Api.eval,
Api.datasetio,
Api.inference,
Api.scoring,
Api.agents,
Api.safety,
Api.vector_io,
Api.tool_runtime,
],
providers,
provider_data,
models=[
ModelInput(model_id=model)
for model in [
inference_model,
judge_model,
]
],
tool_groups=[tool_group_input_memory, tool_group_input_tavily_search],
)
return test_stack.impls

View file

@ -1,184 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
from llama_stack.apis.eval.eval import (
AppBenchmarkConfig,
BenchmarkBenchmarkConfig,
ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
from .constants import JUDGE_PROMPT
# How to run this test:
#
# pytest llama_stack/providers/tests/eval/test_eval.py
# -m "meta_reference_eval_together_inference_huggingface_datasetio"
# -v -s --tb=short --disable-warnings
class Testeval:
@pytest.mark.asyncio
async def test_benchmarks_list(self, eval_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
benchmarks_impl = eval_stack[Api.benchmarks]
response = await benchmarks_impl.list_benchmarks()
assert isinstance(response, list)
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
eval_impl, benchmarks_impl, datasetio_impl, datasets_impl = (
eval_stack[Api.eval],
eval_stack[Api.benchmarks],
eval_stack[Api.datasetio],
eval_stack[Api.datasets],
)
await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval")
response = await datasets_impl.list_datasets()
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset_for_eval",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_functions = [
"basic::equality",
]
benchmark_id = "meta-reference::app_eval"
await benchmarks_impl.register_benchmark(
benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.evaluate_rows(
benchmark_id=benchmark_id,
input_rows=rows.rows,
scoring_functions=scoring_functions,
benchmark_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
),
scoring_params={
"meta-reference::llm_as_judge_base": LLMAsJudgeScoringFnParams(
judge_model=judge_model,
prompt_template=JUDGE_PROMPT,
judge_score_regexes=[
r"Total rating: (\d+)",
r"rating: (\d+)",
r"Rating: (\d+)",
],
)
},
),
)
assert len(response.generations) == 3
assert "basic::equality" in response.scores
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
eval_impl, benchmarks_impl, datasets_impl = (
eval_stack[Api.eval],
eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
)
await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval")
scoring_functions = [
"basic::subset_of",
]
benchmark_id = "meta-reference::app_eval-2"
await benchmarks_impl.register_benchmark(
benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.run_eval(
benchmark_id=benchmark_id,
benchmark_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
),
),
)
assert response.job_id == "0"
job_status = await eval_impl.job_status(benchmark_id, response.job_id)
assert job_status and job_status.value == "completed"
eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
assert "basic::subset_of" in eval_response.scores
@pytest.mark.asyncio
async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
eval_impl, benchmarks_impl, datasets_impl = (
eval_stack[Api.eval],
eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
)
response = await datasets_impl.list_datasets()
assert len(response) > 0
if response[0].provider_id != "huggingface":
pytest.skip("Only huggingface provider supports pre-registered remote datasets")
await datasets_impl.register_dataset(
dataset_id="mmlu",
dataset_schema={
"input_query": StringType(),
"expected_answer": StringType(),
"chat_completion_input": ChatCompletionInputType(),
},
url=URL(uri="https://huggingface.co/datasets/llamastack/evals"),
metadata={
"path": "llamastack/evals",
"name": "evals__mmlu__details",
"split": "train",
},
)
# register eval task
await benchmarks_impl.register_benchmark(
benchmark_id="meta-reference-mmlu",
dataset_id="mmlu",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
# list benchmarks
response = await benchmarks_impl.list_benchmarks()
assert len(response) > 0
benchmark_id = "meta-reference-mmlu"
response = await eval_impl.run_eval(
benchmark_id=benchmark_id,
benchmark_config=BenchmarkBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
),
num_examples=3,
),
)
job_status = await eval_impl.job_status(benchmark_id, response.job_id)
assert job_status and job_status.value == "completed"
eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 3

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,42 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..conftest import get_provider_fixture_overrides
from ..datasetio.fixtures import DATASETIO_FIXTURES
from .fixtures import POST_TRAINING_FIXTURES
DEFAULT_PROVIDER_COMBINATIONS = [
pytest.param(
{
"post_training": "torchtune",
"datasetio": "huggingface",
},
id="torchtune_post_training_huggingface_datasetio",
marks=pytest.mark.torchtune_post_training_huggingface_datasetio,
),
]
def pytest_configure(config):
combined_fixtures = "torchtune_post_training_huggingface_datasetio"
config.addinivalue_line(
"markers",
f"{combined_fixtures}: marks tests as {combined_fixtures} specific",
)
def pytest_generate_tests(metafunc):
if "post_training_stack" in metafunc.fixturenames:
available_fixtures = {
"eval": POST_TRAINING_FIXTURES,
"datasetio": DATASETIO_FIXTURES,
}
combinations = (
get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
)
metafunc.parametrize("post_training_stack", combinations, indirect=True)

View file

@ -1,72 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
import pytest_asyncio
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import StringType
from llama_stack.apis.datasets import DatasetInput
from llama_stack.apis.models import ModelInput
from llama_stack.distribution.datatypes import Api, Provider
from llama_stack.providers.tests.resolver import construct_stack_for_test
from ..conftest import ProviderFixture
@pytest.fixture(scope="session")
def post_training_torchtune() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="torchtune",
provider_type="inline::torchtune",
config={},
)
],
)
POST_TRAINING_FIXTURES = ["torchtune"]
@pytest_asyncio.fixture(scope="session")
async def post_training_stack(request):
fixture_dict = request.param
providers = {}
provider_data = {}
for key in ["post_training", "datasetio"]:
fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
providers[key] = fixture.providers
if fixture.provider_data:
provider_data.update(fixture.provider_data)
test_stack = await construct_stack_for_test(
[Api.post_training, Api.datasetio],
providers,
provider_data,
models=[ModelInput(model_id="meta-llama/Llama-3.2-3B-Instruct")],
datasets=[
DatasetInput(
dataset_id="alpaca",
provider_id="huggingface",
url=URL(uri="https://huggingface.co/datasets/tatsu-lab/alpaca"),
metadata={
"path": "tatsu-lab/alpaca",
"split": "train",
},
dataset_schema={
"instruction": StringType(),
"input": StringType(),
"output": StringType(),
"text": StringType(),
},
),
],
)
return test_stack.impls[Api.post_training]

View file

@ -1,100 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
import pytest
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.apis.post_training import (
Checkpoint,
DataConfig,
LoraFinetuningConfig,
OptimizerConfig,
PostTrainingJob,
PostTrainingJobArtifactsResponse,
PostTrainingJobStatusResponse,
TrainingConfig,
)
# How to run this test:
#
# pytest llama_stack/providers/tests/post_training/test_post_training.py
# -m "torchtune_post_training_huggingface_datasetio"
# -v -s --tb=short --disable-warnings
class TestPostTraining:
@pytest.mark.asyncio
async def test_supervised_fine_tune(self, post_training_stack):
algorithm_config = LoraFinetuningConfig(
type="LoRA",
lora_attn_modules=["q_proj", "v_proj", "output_proj"],
apply_lora_to_mlp=True,
apply_lora_to_output=False,
rank=8,
alpha=16,
)
data_config = DataConfig(
dataset_id="alpaca",
batch_size=1,
shuffle=False,
)
optimizer_config = OptimizerConfig(
optimizer_type="adamw",
lr=3e-4,
lr_min=3e-5,
weight_decay=0.1,
num_warmup_steps=100,
)
training_config = TrainingConfig(
n_epochs=1,
data_config=data_config,
optimizer_config=optimizer_config,
max_steps_per_epoch=1,
gradient_accumulation_steps=1,
)
post_training_impl = post_training_stack
response = await post_training_impl.supervised_fine_tune(
job_uuid="1234",
model="Llama3.2-3B-Instruct",
algorithm_config=algorithm_config,
training_config=training_config,
hyperparam_search_config={},
logger_config={},
checkpoint_dir="null",
)
assert isinstance(response, PostTrainingJob)
assert response.job_uuid == "1234"
@pytest.mark.asyncio
async def test_get_training_jobs(self, post_training_stack):
post_training_impl = post_training_stack
jobs_list = await post_training_impl.get_training_jobs()
assert isinstance(jobs_list, List)
assert jobs_list[0].job_uuid == "1234"
@pytest.mark.asyncio
async def test_get_training_job_status(self, post_training_stack):
post_training_impl = post_training_stack
job_status = await post_training_impl.get_training_job_status("1234")
assert isinstance(job_status, PostTrainingJobStatusResponse)
assert job_status.job_uuid == "1234"
assert job_status.status == JobStatus.completed
assert isinstance(job_status.checkpoints[0], Checkpoint)
@pytest.mark.asyncio
async def test_get_training_job_artifacts(self, post_training_stack):
post_training_impl = post_training_stack
job_artifacts = await post_training_impl.get_training_job_artifacts("1234")
assert isinstance(job_artifacts, PostTrainingJobArtifactsResponse)
assert job_artifacts.job_uuid == "1234"
assert isinstance(job_artifacts.checkpoints[0], Checkpoint)
assert job_artifacts.checkpoints[0].identifier == "Llama3.2-3B-Instruct-sft-0"
assert job_artifacts.checkpoints[0].epoch == 0
assert "/.llama/checkpoints/Llama3.2-3B-Instruct-sft-0" in job_artifacts.checkpoints[0].path

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,75 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..conftest import get_provider_fixture_overrides
from ..datasetio.fixtures import DATASETIO_FIXTURES
from ..inference.fixtures import INFERENCE_FIXTURES
from .fixtures import SCORING_FIXTURES
DEFAULT_PROVIDER_COMBINATIONS = [
pytest.param(
{
"scoring": "basic",
"datasetio": "localfs",
"inference": "together",
},
id="basic_scoring_together_inference",
marks=pytest.mark.basic_scoring_together_inference,
),
pytest.param(
{
"scoring": "braintrust",
"datasetio": "localfs",
"inference": "together",
},
id="braintrust_scoring_together_inference",
marks=pytest.mark.braintrust_scoring_together_inference,
),
pytest.param(
{
"scoring": "llm_as_judge",
"datasetio": "localfs",
"inference": "together",
},
id="llm_as_judge_scoring_together_inference",
marks=pytest.mark.llm_as_judge_scoring_together_inference,
),
]
def pytest_configure(config):
for fixture_name in [
"basic_scoring_together_inference",
"braintrust_scoring_together_inference",
"llm_as_judge_scoring_together_inference",
]:
config.addinivalue_line(
"markers",
f"{fixture_name}: marks tests as {fixture_name} specific",
)
def pytest_generate_tests(metafunc):
judge_model = metafunc.config.getoption("--judge-model")
if "judge_model" in metafunc.fixturenames:
metafunc.parametrize(
"judge_model",
[pytest.param(judge_model, id="")],
indirect=True,
)
if "scoring_stack" in metafunc.fixturenames:
available_fixtures = {
"scoring": SCORING_FIXTURES,
"datasetio": DATASETIO_FIXTURES,
"inference": INFERENCE_FIXTURES,
}
combinations = (
get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
)
metafunc.parametrize("scoring_stack", combinations, indirect=True)

View file

@ -1,100 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
import pytest_asyncio
from llama_stack.apis.models import ModelInput
from llama_stack.distribution.datatypes import Api, Provider
from llama_stack.providers.inline.scoring.braintrust import BraintrustScoringConfig
from llama_stack.providers.tests.resolver import construct_stack_for_test
from ..conftest import ProviderFixture, remote_stack_fixture
from ..env import get_env_or_fail
@pytest.fixture(scope="session")
def scoring_remote() -> ProviderFixture:
return remote_stack_fixture()
@pytest.fixture(scope="session")
def judge_model(request):
if hasattr(request, "param"):
return request.param
return request.config.getoption("--judge-model", None)
@pytest.fixture(scope="session")
def scoring_basic() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="basic",
provider_type="inline::basic",
config={},
)
],
)
@pytest.fixture(scope="session")
def scoring_braintrust() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="braintrust",
provider_type="inline::braintrust",
config=BraintrustScoringConfig(
openai_api_key=get_env_or_fail("OPENAI_API_KEY"),
).model_dump(),
)
],
)
@pytest.fixture(scope="session")
def scoring_llm_as_judge() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="llm-as-judge",
provider_type="inline::llm-as-judge",
config={},
)
],
)
SCORING_FIXTURES = ["basic", "remote", "braintrust", "llm_as_judge"]
@pytest_asyncio.fixture(scope="session")
async def scoring_stack(request, inference_model, judge_model):
fixture_dict = request.param
providers = {}
provider_data = {}
for key in ["datasetio", "scoring", "inference"]:
fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
providers[key] = fixture.providers
if fixture.provider_data:
provider_data.update(fixture.provider_data)
test_stack = await construct_stack_for_test(
[Api.scoring, Api.datasetio, Api.inference],
providers,
provider_data,
models=[
ModelInput(model_id=model)
for model in [
inference_model,
judge_model,
]
],
)
return test_stack.impls

View file

@ -1,213 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
LLMAsJudgeScoringFnParams,
RegexParserScoringFnParams,
)
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
# How to run this test:
#
# pytest llama_stack/providers/tests/scoring/test_scoring.py
# -m "meta_reference"
# -v -s --tb=short --disable-warnings
@pytest.fixture
def sample_judge_prompt_template():
return "Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9."
class TestScoring:
@pytest.mark.asyncio
async def test_scoring_functions_list(self, scoring_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
scoring_functions_impl = scoring_stack[Api.scoring_functions]
response = await scoring_functions_impl.list_scoring_functions()
assert isinstance(response, list)
assert len(response) > 0
@pytest.mark.asyncio
async def test_scoring_score(self, scoring_stack):
(
scoring_impl,
scoring_functions_impl,
datasetio_impl,
datasets_impl,
) = (
scoring_stack[Api.scoring],
scoring_stack[Api.scoring_functions],
scoring_stack[Api.datasetio],
scoring_stack[Api.datasets],
)
scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
provider_id = scoring_fns_list[0].provider_id
if provider_id == "llm-as-judge":
pytest.skip(f"{provider_id} provider does not support scoring without params")
await register_dataset(datasets_impl, for_rag=True)
response = await datasets_impl.list_datasets()
assert len(response) == 1
# scoring individual rows
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
scoring_functions = {
scoring_fns_list[0].identifier: None,
}
response = await scoring_impl.score(
input_rows=rows.rows,
scoring_functions=scoring_functions,
)
assert len(response.results) == len(scoring_functions)
for x in scoring_functions:
assert x in response.results
assert len(response.results[x].score_rows) == len(rows.rows)
# score batch
response = await scoring_impl.score_batch(
dataset_id="test_dataset",
scoring_functions=scoring_functions,
)
assert len(response.results) == len(scoring_functions)
for x in scoring_functions:
assert x in response.results
assert len(response.results[x].score_rows) == 5
@pytest.mark.asyncio
async def test_scoring_score_with_params_llm_as_judge(
self, scoring_stack, sample_judge_prompt_template, judge_model
):
(
scoring_impl,
scoring_functions_impl,
datasetio_impl,
datasets_impl,
) = (
scoring_stack[Api.scoring],
scoring_stack[Api.scoring_functions],
scoring_stack[Api.datasetio],
scoring_stack[Api.datasets],
)
await register_dataset(datasets_impl, for_rag=True)
response = await datasets_impl.list_datasets()
assert len(response) == 1
scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
provider_id = scoring_fns_list[0].provider_id
if provider_id == "braintrust" or provider_id == "basic":
pytest.skip(f"{provider_id} provider does not support scoring with params")
# scoring individual rows
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_functions = {
"llm-as-judge::base": LLMAsJudgeScoringFnParams(
judge_model=judge_model,
prompt_template=sample_judge_prompt_template,
judge_score_regexes=[r"Score: (\d+)"],
aggregation_functions=[AggregationFunctionType.categorical_count],
)
}
response = await scoring_impl.score(
input_rows=rows.rows,
scoring_functions=scoring_functions,
)
assert len(response.results) == len(scoring_functions)
for x in scoring_functions:
assert x in response.results
assert len(response.results[x].score_rows) == len(rows.rows)
# score batch
response = await scoring_impl.score_batch(
dataset_id="test_dataset",
scoring_functions=scoring_functions,
)
assert len(response.results) == len(scoring_functions)
for x in scoring_functions:
assert x in response.results
assert len(response.results[x].score_rows) == 5
@pytest.mark.asyncio
async def test_scoring_score_with_aggregation_functions(
self, scoring_stack, sample_judge_prompt_template, judge_model
):
(
scoring_impl,
scoring_functions_impl,
datasetio_impl,
datasets_impl,
) = (
scoring_stack[Api.scoring],
scoring_stack[Api.scoring_functions],
scoring_stack[Api.datasetio],
scoring_stack[Api.datasets],
)
await register_dataset(datasets_impl, for_rag=True)
rows = await datasetio_impl.get_rows_paginated(
dataset_id="test_dataset",
rows_in_page=3,
)
assert len(rows.rows) == 3
scoring_fns_list = await scoring_functions_impl.list_scoring_functions()
scoring_functions = {}
aggr_fns = [
AggregationFunctionType.accuracy,
AggregationFunctionType.median,
AggregationFunctionType.categorical_count,
AggregationFunctionType.average,
]
for x in scoring_fns_list:
if x.provider_id == "llm-as-judge":
aggr_fns = [AggregationFunctionType.categorical_count]
scoring_functions[x.identifier] = LLMAsJudgeScoringFnParams(
judge_model=judge_model,
prompt_template=sample_judge_prompt_template,
judge_score_regexes=[r"Score: (\d+)"],
aggregation_functions=aggr_fns,
)
elif x.provider_id == "basic" or x.provider_id == "braintrust":
if "regex_parser" in x.identifier:
scoring_functions[x.identifier] = RegexParserScoringFnParams(
aggregation_functions=aggr_fns,
)
else:
scoring_functions[x.identifier] = BasicScoringFnParams(
aggregation_functions=aggr_fns,
)
else:
scoring_functions[x.identifier] = None
response = await scoring_impl.score(
input_rows=rows.rows,
scoring_functions=scoring_functions,
)
assert len(response.results) == len(scoring_functions)
for x in scoring_functions:
assert x in response.results
assert len(response.results[x].score_rows) == len(rows.rows)
assert len(response.results[x].aggregated_results) == len(aggr_fns)

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,48 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..conftest import get_provider_fixture_overrides
from ..inference.fixtures import INFERENCE_FIXTURES
from ..safety.fixtures import SAFETY_FIXTURES
from ..vector_io.fixtures import VECTOR_IO_FIXTURES
from .fixtures import TOOL_RUNTIME_FIXTURES
DEFAULT_PROVIDER_COMBINATIONS = [
pytest.param(
{
"inference": "together",
"safety": "llama_guard",
"vector_io": "faiss",
"tool_runtime": "memory_and_search",
},
id="together",
marks=pytest.mark.together,
),
]
def pytest_configure(config):
for mark in ["together"]:
config.addinivalue_line(
"markers",
f"{mark}: marks tests as {mark} specific",
)
def pytest_generate_tests(metafunc):
if "tools_stack" in metafunc.fixturenames:
available_fixtures = {
"inference": INFERENCE_FIXTURES,
"safety": SAFETY_FIXTURES,
"vector_io": VECTOR_IO_FIXTURES,
"tool_runtime": TOOL_RUNTIME_FIXTURES,
}
combinations = (
get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS
)
metafunc.parametrize("tools_stack", combinations, indirect=True)

View file

@ -1,133 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import pytest
import pytest_asyncio
from llama_stack.apis.models import ModelInput, ModelType
from llama_stack.apis.tools import ToolGroupInput
from llama_stack.distribution.datatypes import Api, Provider
from llama_stack.providers.tests.resolver import construct_stack_for_test
from ..conftest import ProviderFixture
@pytest.fixture(scope="session")
def tool_runtime_memory_and_search() -> ProviderFixture:
return ProviderFixture(
providers=[
Provider(
provider_id="rag-runtime",
provider_type="inline::rag-runtime",
config={},
),
Provider(
provider_id="tavily-search",
provider_type="remote::tavily-search",
config={
"api_key": os.environ["TAVILY_SEARCH_API_KEY"],
},
),
Provider(
provider_id="wolfram-alpha",
provider_type="remote::wolfram-alpha",
config={
"api_key": os.environ["WOLFRAM_ALPHA_API_KEY"],
},
),
],
)
@pytest.fixture(scope="session")
def tool_group_input_memory() -> ToolGroupInput:
return ToolGroupInput(
toolgroup_id="builtin::rag",
provider_id="rag-runtime",
)
@pytest.fixture(scope="session")
def tool_group_input_tavily_search() -> ToolGroupInput:
return ToolGroupInput(
toolgroup_id="builtin::web_search",
provider_id="tavily-search",
)
@pytest.fixture(scope="session")
def tool_group_input_wolfram_alpha() -> ToolGroupInput:
return ToolGroupInput(
toolgroup_id="builtin::wolfram_alpha",
provider_id="wolfram-alpha",
)
TOOL_RUNTIME_FIXTURES = ["memory_and_search"]
@pytest_asyncio.fixture(scope="session")
async def tools_stack(
request,
inference_model,
tool_group_input_memory,
tool_group_input_tavily_search,
tool_group_input_wolfram_alpha,
):
fixture_dict = request.param
providers = {}
provider_data = {}
for key in ["inference", "vector_io", "tool_runtime"]:
fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
providers[key] = fixture.providers
if key == "inference":
providers[key].append(
Provider(
provider_id="tools_memory_provider",
provider_type="inline::sentence-transformers",
config={},
)
)
if fixture.provider_data:
provider_data.update(fixture.provider_data)
inference_models = inference_model if isinstance(inference_model, list) else [inference_model]
models = [
ModelInput(
model_id=model,
model_type=ModelType.llm,
provider_id=providers["inference"][0].provider_id,
)
for model in inference_models
]
models.append(
ModelInput(
model_id="all-MiniLM-L6-v2",
model_type=ModelType.embedding,
provider_id="tools_memory_provider",
metadata={"embedding_dimension": 384},
)
)
test_stack = await construct_stack_for_test(
[
Api.tool_groups,
Api.inference,
Api.vector_io,
Api.tool_runtime,
],
providers,
provider_data,
models=models,
tool_groups=[
tool_group_input_tavily_search,
tool_group_input_wolfram_alpha,
tool_group_input_memory,
],
)
return test_stack

View file

@ -1,109 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import pytest
from llama_stack.apis.tools import RAGDocument, RAGQueryResult, ToolInvocationResult
from llama_stack.providers.datatypes import Api
@pytest.fixture
def sample_search_query():
return "What are the latest developments in quantum computing?"
@pytest.fixture
def sample_wolfram_alpha_query():
return "What is the square root of 16?"
@pytest.fixture
def sample_documents():
urls = [
"memory_optimizations.rst",
"chat.rst",
"llama3.rst",
"qat_finetune.rst",
"lora_finetune.rst",
]
return [
RAGDocument(
document_id=f"num-{i}",
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
metadata={},
)
for i, url in enumerate(urls)
]
class TestTools:
@pytest.mark.asyncio
async def test_web_search_tool(self, tools_stack, sample_search_query):
"""Test the web search tool functionality."""
if "TAVILY_SEARCH_API_KEY" not in os.environ:
pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test")
tools_impl = tools_stack.impls[Api.tool_runtime]
# Execute the tool
response = await tools_impl.invoke_tool(tool_name="web_search", kwargs={"query": sample_search_query})
# Verify the response
assert isinstance(response, ToolInvocationResult)
assert response.content is not None
assert len(response.content) > 0
assert isinstance(response.content, str)
@pytest.mark.asyncio
async def test_wolfram_alpha_tool(self, tools_stack, sample_wolfram_alpha_query):
"""Test the wolfram alpha tool functionality."""
if "WOLFRAM_ALPHA_API_KEY" not in os.environ:
pytest.skip("WOLFRAM_ALPHA_API_KEY not set, skipping test")
tools_impl = tools_stack.impls[Api.tool_runtime]
response = await tools_impl.invoke_tool(tool_name="wolfram_alpha", kwargs={"query": sample_wolfram_alpha_query})
# Verify the response
assert isinstance(response, ToolInvocationResult)
assert response.content is not None
assert len(response.content) > 0
assert isinstance(response.content, str)
@pytest.mark.asyncio
async def test_rag_tool(self, tools_stack, sample_documents):
"""Test the memory tool functionality."""
vector_dbs_impl = tools_stack.impls[Api.vector_dbs]
tools_impl = tools_stack.impls[Api.tool_runtime]
# Register memory bank
await vector_dbs_impl.register_vector_db(
vector_db_id="test_bank",
embedding_model="all-MiniLM-L6-v2",
embedding_dimension=384,
provider_id="faiss",
)
# Insert documents into memory
await tools_impl.rag_tool.insert(
documents=sample_documents,
vector_db_id="test_bank",
chunk_size_in_tokens=512,
)
# Execute the memory tool
response = await tools_impl.rag_tool.query(
content="What are the main topics covered in the documentation?",
vector_db_ids=["test_bank"],
)
# Verify the response
assert isinstance(response, RAGQueryResult)
assert response.content is not None
assert len(response.content) > 0