[Evals API][4/n] evals with generation meta-reference impl (#303)

* wip

* dataset validation

* test_scoring

* cleanup

* clean up test

* comments

* error checking

* dataset client

* test client

* datasetio client

* clean up

* basic scoring function works

* scorer wip

* equality scorer

* score batch impl

* score batch

* update scoring test

* refactor

* validate scorer input

* address comments

* evals with generation

* add all rows scores to ScoringResult

* minor typing

* bugfix

* scoring function def rename

* rebase name

* refactor

* address comments

* Update iOS inference instructions for new quantization

* Small updates to quantization config

* Fix score threshold in faiss

* Bump version to 0.0.45

* Handle both ipv6 and ipv4 interfaces together

* update manifest for build templates

* Update getting_started.md

* chatcompletion & completion input type validation

* inclusion -> subset_of

* error checking

* scoring_function -> scoring_fn rename, scorer -> scoring_fn rename

* address comments

* [Evals API][5/n] fixes to generate openapi spec (#323)

* generate openapi

* typing comment, dataset -> dataset_id

* remove custom type

* sample eval run.yaml

---------

Co-authored-by: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Xi Yan 2024-10-25 13:12:39 -07:00 committed by GitHub
parent 426d821e7f
commit abdf7cddf3
31 changed files with 3371 additions and 1296 deletions

llama_stack/providers/tests/datasetio/test_dataset.csv

@@ -1,6 +1,6 @@
-input_query,generated_answer,expected_answer
-What is the capital of France?,London,Paris
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg
-What is the largest planet in our solar system?,Jupiter,Jupiter
-What is the smallest country in the world?,China,Vatican City
-What is the currency of Japan?,Yen,Yen
+input_query,generated_answer,expected_answer,chat_completion_input
+What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
+Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
+What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
+What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
+What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
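A note on the new column: chat_completion_input stores the chat messages as a Python-literal list (single quotes rather than JSON), so a row can be turned back into messages with ast.literal_eval. A minimal sketch, assuming the CSV above is read directly:

import ast
import csv

# Minimal sketch: recover chat messages and expected answer from the eval CSV.
# ast.literal_eval is used because the column is a Python literal, not JSON.
with open("test_dataset.csv", newline="") as f:
    for row in csv.DictReader(f):
        messages = ast.literal_eval(row["chat_completion_input"])
        print(messages[0]["content"], "->", row["expected_answer"])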


llama_stack/providers/tests/datasetio/test_datasetio.py

@@ -61,20 +61,31 @@ def data_url_from_file(file_path: str) -> str:
     return data_url


-async def register_dataset(datasets_impl: Datasets):
+async def register_dataset(
+    datasets_impl: Datasets, for_generation=False, dataset_id="test_dataset"
+):
     test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv"
     test_url = data_url_from_file(str(test_file))
+
+    if for_generation:
+        dataset_schema = {
+            "expected_answer": StringType(),
+            "chat_completion_input": ChatCompletionInputType(),
+        }
+    else:
+        dataset_schema = {
+            "expected_answer": StringType(),
+            "input_query": StringType(),
+            "generated_answer": StringType(),
+        }
+
     dataset = DatasetDefWithProvider(
-        identifier="test_dataset",
+        identifier=dataset_id,
         provider_id=os.environ["PROVIDER_ID"],
         url=URL(
             uri=test_url,
         ),
-        dataset_schema={
-            "generated_answer": StringType(),
-            "expected_answer": StringType(),
-            "input_query": StringType(),
-        },
+        dataset_schema=dataset_schema,
     )
     await datasets_impl.register_dataset(dataset)
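Usage sketch for the new signature (illustrative only; the helper name below is hypothetical): the same CSV can now be registered under the default scoring schema or, with for_generation=True, under the generation schema used by the eval tests.

from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset

# Hypothetical helper: register the same CSV under two ids/schemas.
# Assumes PROVIDER_ID is set and datasets_impl is a resolved Datasets
# implementation, as in the test fixtures.
async def register_for_scoring_and_eval(datasets_impl):
    await register_dataset(datasets_impl)  # default: scoring schema, id "test_dataset"
    await register_dataset(
        datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval"
    )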

llama_stack/providers/tests/eval/__init__.py (new file)

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

llama_stack/providers/tests/eval/provider_config_example.yaml (new file)

@@ -0,0 +1,18 @@
providers:
  datasetio:
    - provider_id: test-meta
      provider_type: meta-reference
      config: {}
  scoring:
    - provider_id: test-meta
      provider_type: meta-reference
      config: {}
  eval:
    - provider_id: test-meta
      provider_type: meta-reference
      config: {}
  inference:
    - provider_id: test-tgi
      provider_type: remote::tgi
      config:
        url: http://127.0.0.1:5009
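A small sketch for sanity-checking a copied provider config before running the tests (assumes PyYAML is available; not part of the test harness):

import yaml

# List which APIs and providers a copied provider_config.yaml wires up.
with open("provider_config.yaml") as f:
    cfg = yaml.safe_load(f)
for api, providers in cfg["providers"].items():
    for p in providers:
        print(f"{api}: {p['provider_id']} ({p['provider_type']})")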

llama_stack/providers/tests/eval/test_eval.py (new file)

@@ -0,0 +1,79 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
import pytest_asyncio
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.apis.datasetio import * # noqa: F403
from llama_stack.apis.eval.eval import ModelCandidate
from llama_stack.distribution.datatypes import * # noqa: F403
from llama_models.llama3.api import SamplingParams
from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
from llama_stack.providers.tests.resolver import resolve_impls_for_test
# How to run this test:
#
# 1. Ensure you have a conda with the right dependencies installed. This is a bit tricky
# since it depends on the provider you are testing. On top of that you need
# `pytest` and `pytest-asyncio` installed.
#
# 2. Copy and modify the provider_config_example.yaml depending on the provider you are testing.
#
# 3. Run:
#
# ```bash
# PROVIDER_ID=<your_provider> \
# PROVIDER_CONFIG=provider_config.yaml \
# pytest -s llama_stack/providers/tests/eval/test_eval.py \
# --tb=short --disable-warnings
# ```

@pytest_asyncio.fixture(scope="session")
async def eval_settings():
    impls = await resolve_impls_for_test(
        Api.eval, deps=[Api.datasetio, Api.scoring, Api.inference]
    )
    return {
        "eval_impl": impls[Api.eval],
        "scoring_impl": impls[Api.scoring],
        "datasets_impl": impls[Api.datasets],
    }


@pytest.mark.asyncio
async def test_eval(eval_settings):
    datasets_impl = eval_settings["datasets_impl"]
    await register_dataset(
        datasets_impl,
        for_generation=True,
        dataset_id="test_dataset_for_eval",
    )

    response = await datasets_impl.list_datasets()
    assert len(response) == 1

    eval_impl = eval_settings["eval_impl"]
    response = await eval_impl.evaluate_batch(
        dataset_id=response[0].identifier,
        candidate=ModelCandidate(
            model="Llama3.2-1B-Instruct",
            sampling_params=SamplingParams(),
        ),
        scoring_functions=["subset_of"],
    )
    assert response.job_id == "0"
    job_status = await eval_impl.job_status(response.job_id)
    assert job_status and job_status.value == "completed"

    eval_response = await eval_impl.job_result(response.job_id)
    assert eval_response is not None
    assert len(eval_response.generations) == 5
    assert "subset_of" in eval_response.scores