Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-15 22:47:59 +00:00)
fix eval

commit 413a1b6d8b (parent 56239fce90)

13 changed files with 293 additions and 15 deletions
@@ -6,13 +6,16 @@
 from enum import Enum
 
 from llama_models.llama3.api.datatypes import *  # noqa: F403
 
+from .....apis.common.job_types import Job
+from .....apis.eval.eval import BenchmarkEvalTaskConfig
 from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import Eval, EvalCandidate, EvaluateResponse, JobStatus
+from llama_stack.apis.eval import *  # noqa: F403
+from llama_stack.apis.eval_tasks import EvalTaskDef
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.scoring import Scoring
+from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
 
 from .config import MetaReferenceEvalConfig

@@ -25,7 +28,7 @@ class ColumnName(Enum):
     generated_answer = "generated_answer"
 
 
-class MetaReferenceEvalImpl(Eval):
+class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
     def __init__(
         self,
         config: MetaReferenceEvalConfig,

@@ -47,6 +50,10 @@ class MetaReferenceEvalImpl(Eval):
 
     async def shutdown(self) -> None: ...
 
+    async def list_eval_tasks(self) -> List[EvalTaskDefWithProvider]:
+        print("HHHH")
+        return []
+
     async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
         dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
         if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:

@@ -70,12 +77,22 @@ class MetaReferenceEvalImpl(Eval):
                 f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}"
             )
 
-    async def evaluate_batch(
+    async def run_benchmark_eval(
         self,
-        dataset_id: str,
-        candidate: EvalCandidate,
-        scoring_functions: List[str],
+        benchmark_id: str,
+        eval_task_config: BenchmarkEvalTaskConfig,
     ) -> Job:
+        raise NotImplementedError("Benchmark eval is not implemented yet")
+
+    async def run_eval(
+        self,
+        eval_task_def: EvalTaskDef,
+        eval_task_config: EvalTaskConfig,
+    ) -> Job:
+        dataset_id = eval_task_def.dataset_id
+        candidate = eval_task_config.eval_candidate
+        scoring_functions = eval_task_def.scoring_functions
+
         await self.validate_eval_input_dataset_schema(dataset_id=dataset_id)
         all_rows = await self.datasetio_api.get_rows_paginated(
             dataset_id=dataset_id,

@@ -93,12 +110,13 @@ class MetaReferenceEvalImpl(Eval):
         self.jobs[job_id] = res
         return Job(job_id=job_id)
 
-    async def evaluate(
+    async def evaluate_rows(
         self,
         input_rows: List[Dict[str, Any]],
-        candidate: EvalCandidate,
         scoring_functions: List[str],
+        eval_task_config: EvalTaskConfig,
     ) -> EvaluateResponse:
+        candidate = eval_task_config.eval_candidate
         if candidate.type == "agent":
             raise NotImplementedError(
                 "Evaluation with generation has not been implemented for agents"
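Taken together, these hunks rename the provider entry points (evaluate_batch becomes run_benchmark_eval, evaluate becomes evaluate_rows) and add run_eval, which reads dataset_id and scoring_functions from an EvalTaskDef and the candidate from the EvalTaskConfig. The sketch below illustrates how a caller might drive the reshaped interface; it is a minimal sketch based only on the signatures visible in this diff, and the EvalTaskDef identifier field, the direct EvalTaskConfig(eval_candidate=...) construction, the scoring function ids, and the row column names are assumptions, not taken from this commit:

# Hypothetical caller of the reshaped Eval provider; a sketch, not code from this commit.
# Assumptions: EvalTaskDef accepts an `identifier` field, EvalTaskConfig can be
# constructed directly with `eval_candidate`, and `impl` is an initialized
# MetaReferenceEvalImpl with its datasets/datasetio/scoring/inference APIs wired up.
from typing import Any, Dict, List

from llama_stack.apis.eval import EvalCandidate, EvalTaskConfig  # assumed import location
from llama_stack.apis.eval_tasks import EvalTaskDef


async def run_example(impl, candidate: EvalCandidate) -> None:
    task_def = EvalTaskDef(
        identifier="example-task",                  # assumed field name
        dataset_id="example-qa-dataset",            # consumed by run_eval (see diff)
        scoring_functions=["example::scoring-fn"],  # placeholder scoring function ids
    )
    task_config = EvalTaskConfig(eval_candidate=candidate)  # "agent" candidates raise NotImplementedError

    # run_eval validates the dataset schema, pulls rows via the datasetio API,
    # and returns a Job handle that can be polled later.
    job = await impl.run_eval(eval_task_def=task_def, eval_task_config=task_config)
    print("started eval job:", job.job_id)

    # evaluate_rows scores rows the caller already has in hand, using the same config.
    rows: List[Dict[str, Any]] = [
        {"input_query": "What is 2 + 2?", "expected_answer": "4"},  # illustrative column names
    ]
    response = await impl.evaluate_rows(
        input_rows=rows,
        scoring_functions=task_def.scoring_functions,
        eval_task_config=task_config,
    )
    print(response)

Note that run_benchmark_eval(benchmark_id=..., eval_task_config=BenchmarkEvalTaskConfig(...)) sits on the same class but, per the diff, currently raises NotImplementedError.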