Merge branch 'eval_task_register' into mmlu_benchmark

Xi Yan 2024-11-07 14:41:50 -08:00
commit cc6edf6287
72 changed files with 306 additions and 304 deletions

@@ -11,7 +11,7 @@ import pytest_asyncio
 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.agents import (
+from llama_stack.providers.inline.agents.meta_reference import (
     MetaReferenceAgentsImplConfig,
 )

@@ -52,7 +52,7 @@ class Testeval:
         response = await eval_impl.evaluate_rows(
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                 eval_candidate=ModelCandidate(
                     model="Llama3.2-3B-Instruct",
                     sampling_params=SamplingParams(),
@@ -76,13 +76,13 @@ class Testeval:
         ]
         response = await eval_impl.run_eval(
-            eval_task_def=EvalTaskDef(
+            task=EvalTaskDef(
                 # NOTE: this is needed to make the router work for all app evals
                 identifier="meta-reference::app_eval",
                 dataset_id="test_dataset_for_eval",
                 scoring_functions=scoring_functions,
             ),
-            eval_task_config=AppEvalTaskConfig(
+            task_config=AppEvalTaskConfig(
                 eval_candidate=ModelCandidate(
                     model="Llama3.2-3B-Instruct",
                     sampling_params=SamplingParams(),
@@ -90,9 +90,13 @@ class Testeval:
             ),
         )
         assert response.job_id == "0"
-        job_status = await eval_impl.job_status(response.job_id)
+        job_status = await eval_impl.job_status(
+            response.job_id, "meta-reference::app_eval"
+        )
         assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(response.job_id)
+        eval_response = await eval_impl.job_result(
+            response.job_id, "meta-reference::app_eval"
+        )
         assert eval_response is not None
         assert len(eval_response.generations) == 5
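
Taken together, these hunks change the eval API surface in two ways: evaluate_rows() and run_eval() now take the run configuration as task_config= (and run_eval() takes the task definition as task=), and job_status()/job_result() take the eval task identifier as a second argument. Below is a minimal sketch of the post-merge call shape, based only on the lines changed above; eval_impl, EvalTaskDef, AppEvalTaskConfig, ModelCandidate and SamplingParams are assumed to be available exactly as in the test module, whose imports are not part of this diff.

# Sketch only: mirrors the post-merge keyword names shown in the hunks above.
# eval_impl is the eval provider impl from the test fixtures; the other names
# are assumed to be imported as in the test module (imports not in this diff).
async def run_app_eval(eval_impl, scoring_functions):
    response = await eval_impl.run_eval(
        task=EvalTaskDef(  # previously eval_task_def=
            identifier="meta-reference::app_eval",
            dataset_id="test_dataset_for_eval",
            scoring_functions=scoring_functions,
        ),
        task_config=AppEvalTaskConfig(  # previously eval_task_config=
            eval_candidate=ModelCandidate(
                model="Llama3.2-3B-Instruct",
                sampling_params=SamplingParams(),
            ),
        ),
    )
    # job_status() and job_result() now also take the eval task identifier.
    status = await eval_impl.job_status(response.job_id, "meta-reference::app_eval")
    result = await eval_impl.job_result(response.job_id, "meta-reference::app_eval")
    return status, result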

@@ -10,7 +10,7 @@ import pytest
 import pytest_asyncio
 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.inference import (
+from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceInferenceConfig,
 )

@@ -11,7 +11,7 @@ import pytest
 import pytest_asyncio
 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.memory import FaissImplConfig
+from llama_stack.providers.inline.memory.faiss import FaissImplConfig
 from llama_stack.providers.remote.memory.pgvector import PGVectorConfig
 from llama_stack.providers.remote.memory.weaviate import WeaviateConfig

@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 from llama_stack.distribution.datatypes import Api, Provider
-from llama_stack.providers.inline.meta_reference.safety import (
+from llama_stack.providers.inline.safety.meta_reference import (
     LlamaGuardShieldConfig,
     SafetyConfig,
 )
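
The conftest hunks in this merge all apply the same rename: inline provider modules move from llama_stack.providers.inline.meta_reference.<api> to llama_stack.providers.inline.<api>.<provider>. A short before/after sketch of the pattern, using only the imports touched above:

# Old layout (pre-merge):
#   from llama_stack.providers.inline.meta_reference.inference import MetaReferenceInferenceConfig
#   from llama_stack.providers.inline.meta_reference.memory import FaissImplConfig
# New layout (post-merge), i.e. inline.<api>.<provider>:
from llama_stack.providers.inline.agents.meta_reference import MetaReferenceAgentsImplConfig
from llama_stack.providers.inline.inference.meta_reference import MetaReferenceInferenceConfig
from llama_stack.providers.inline.memory.faiss import FaissImplConfig
from llama_stack.providers.inline.safety.meta_reference import LlamaGuardShieldConfig, SafetyConfig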

@@ -44,10 +44,10 @@ class TestScoring:
         )
         assert len(rows.rows) == 3
-        scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
-            "meta-reference::equality",
-        ]
+        scoring_functions = {
+            "meta-reference::llm_as_judge_8b_correctness": None,
+            "meta-reference::equality": None,
+        }
         response = await scoring_impl.score(
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
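
With this change, scoring_functions is passed to score() as a mapping from scoring-function identifier to optional per-function parameters instead of a plain list, with None meaning the function's defaults. A minimal sketch of the new call, assuming scoring_impl and rows come from the same fixtures this test uses:

# Sketch only: scoring_functions is now a dict of {scoring_fn_id: params_or_None}.
async def score_with_defaults(scoring_impl, rows):
    scoring_functions = {
        "meta-reference::llm_as_judge_8b_correctness": None,  # None -> default params
        "meta-reference::equality": None,
    }
    response = await scoring_impl.score(
        input_rows=rows.rows,
        scoring_functions=scoring_functions,
    )
    assert len(response.results) == len(scoring_functions)
    return response
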
@@ -83,7 +83,7 @@ class TestScoring:
         )
         assert len(rows.rows) == 3
-        params = {
+        scoring_functions = {
             "meta-reference::llm_as_judge_8b_correctness": LLMAsJudgeScoringFnParams(
                 judge_model="Llama3.1-405B-Instruct",
                 prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
@@ -91,13 +91,9 @@ class TestScoring:
             )
         }
-        scoring_functions = [
-            "meta-reference::llm_as_judge_8b_correctness",
-        ]
         response = await scoring_impl.score(
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
-            scoring_params=params,
         )
         assert len(response.results) == len(scoring_functions)
         for x in scoring_functions:
@@ -108,7 +104,6 @@ class TestScoring:
         response = await scoring_impl.score_batch(
             dataset_id="test_dataset",
             scoring_functions=scoring_functions,
-            scoring_params=params,
         )
         assert len(response.results) == len(scoring_functions)
         for x in scoring_functions:
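
The last two hunks drop the separate scoring_params= argument entirely: per-function parameters such as LLMAsJudgeScoringFnParams now travel inside the scoring_functions mapping itself, for both score() and score_batch(). A sketch of that shape, again assuming the fixture objects and the LLMAsJudgeScoringFnParams import used by this test module:

# Sketch only: judge parameters ride along as the dict value; there is no
# separate scoring_params= keyword after this merge.
async def score_with_judge_params(scoring_impl, rows):
    scoring_functions = {
        "meta-reference::llm_as_judge_8b_correctness": LLMAsJudgeScoringFnParams(
            judge_model="Llama3.1-405B-Instruct",
            prompt_template="Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9.",
        ),
    }
    response = await scoring_impl.score(
        input_rows=rows.rows,
        scoring_functions=scoring_functions,
    )
    batch_response = await scoring_impl.score_batch(
        dataset_id="test_dataset",
        scoring_functions=scoring_functions,
    )
    return response, batch_response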