work eval

This commit is contained in:
Xi Yan 2025-03-05 17:12:47 -08:00
parent fd68b0dc9a
commit 6e65b9282d
2 changed files with 50 additions and 9 deletions

View file

@ -1,6 +1,6 @@
input_query,generated_answer,expected_answer,chat_completion_input input_query,generated_answer,expected_answer,chat_completion_input
What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]" What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]" Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]" What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]" What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]" What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"

1 input_query generated_answer expected_answer chat_completion_input
2 What is the capital of France? London Paris [{'role': 'user', 'content': 'What is the capital of France?'}] [{"role": "user", "content": "What is the capital of France?"}]
3 Who is the CEO of Meta? Mark Zuckerberg Mark Zuckerberg [{'role': 'user', 'content': 'Who is the CEO of Meta?'}] [{"role": "user", "content": "Who is the CEO of Meta?"}]
4 What is the largest planet in our solar system? Jupiter Jupiter [{'role': 'user', 'content': 'What is the largest planet in our solar system?'}] [{"role": "user", "content": "What is the largest planet in our solar system?"}]
5 What is the smallest country in the world? China Vatican City [{'role': 'user', 'content': 'What is the smallest country in the world?'}] [{"role": "user", "content": "What is the smallest country in the world?"}]
6 What is the currency of Japan? Yen Yen [{'role': 'user', 'content': 'What is the currency of Japan?'}] [{"role": "user", "content": "What is the currency of Japan?"}]

View file

@ -3,17 +3,58 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import uuid
import pytest
from ..datasetio.test_datasetio import register_dataset
# How to run this test:
#
# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/eval
@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
    """Evaluate a page of dataset rows against a freshly registered benchmark.

    Steps:
      1. Register the generation dataset and confirm it is listed.
      2. Fetch one page of rows from it.
      3. Register a benchmark under a random UUID and confirm it is listed.
      4. Call ``eval.evaluate_rows`` with a model candidate and check that one
         generation per input row comes back and that the requested scoring
         function appears in the scores.
    """
    dataset_id = "test_dataset_for_eval"
    page_size = 3

    # 1. Dataset registration.
    register_dataset(llama_stack_client, for_generation=True, dataset_id=dataset_id)
    registered_datasets = llama_stack_client.datasets.list()
    assert dataset_id in [d.identifier for d in registered_datasets]

    # 2. Pull the rows that will be fed to the evaluation.
    page = llama_stack_client.datasetio.get_rows_paginated(
        dataset_id=dataset_id,
        rows_in_page=page_size,
    )
    assert len(page.rows) == page_size

    # 3. Benchmark registration; a random id keeps repeated runs from colliding.
    scoring_functions = [scoring_fn_id]
    benchmark_id = str(uuid.uuid4())
    llama_stack_client.benchmarks.register(
        benchmark_id=benchmark_id,
        dataset_id=dataset_id,
        scoring_functions=scoring_functions,
    )
    registered_benchmarks = llama_stack_client.benchmarks.list()
    assert benchmark_id in [b.identifier for b in registered_benchmarks]

    # 4. Run the evaluation with the model under test as the candidate.
    benchmark_config = {
        "eval_candidate": {
            "type": "model",
            "model": text_model_id,
            "sampling_params": {
                "temperature": 0.0,
            },
        },
    }
    result = llama_stack_client.eval.evaluate_rows(
        benchmark_id=benchmark_id,
        input_rows=page.rows,
        scoring_functions=scoring_functions,
        benchmark_config=benchmark_config,
    )
    assert len(result.generations) == page_size
    assert scoring_fn_id in result.scores
# @pytest.mark.skip(reason="FIXME FIXME @yanxi0830 this needs to be migrated to use the API") # @pytest.mark.skip(reason="FIXME FIXME @yanxi0830 this needs to be migrated to use the API")