# What does this PR do?

To make the migration easier, delete the existing `eval/scoring/scoring_function` APIs. This leaves a number of broken implementations behind; the cleanup sequence is:

1. Migrate the benchmark graders.
2. Clean up the existing scoring functions.

This PR also adds a skeleton evaluation implementation to make the tests pass.

## Test Plan

Tested in the follow-up PRs.

[//]: # (## Documentation)
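The skeleton in question is the meta-reference evaluation provider reproduced below: the lifecycle hooks and benchmark registration are no-ops, and every `Evaluation` entry point raises `NotImplementedError` until the graders are migrated.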
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.agents import Agents
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate

from .....apis.benchmarks import Benchmark
from .....apis.evaluation.evaluation import (
    Evaluation,
    EvaluationCandidate,
    EvaluationJob,
    EvaluationResponse,
    EvaluationTask,
)
from .config import MetaReferenceEvaluationConfig

EVAL_TASKS_PREFIX = "benchmarks:"


class MetaReferenceEvaluationImpl(
    Evaluation,
    BenchmarksProtocolPrivate,
):
    # Skeleton implementation: holds on to its API dependencies but does not
    # implement any evaluation logic yet.
    def __init__(
        self,
        config: MetaReferenceEvaluationConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        inference_api: Inference,
        agents_api: Agents,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.inference_api = inference_api
        self.agents_api = agents_api

    # Lifecycle hooks and benchmark registration are no-ops for now.
    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def register_benchmark(self, benchmark: Benchmark) -> None:
        pass

    # The actual evaluation entry points are stubbed out until the benchmark
    # graders are migrated.
    async def run(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationJob:
        raise NotImplementedError("Run is not implemented yet")

    async def run_sync(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationResponse:
        raise NotImplementedError("Run sync is not implemented yet")

    async def grade(self, task: EvaluationTask) -> EvaluationJob:
        raise NotImplementedError("Grade is not implemented yet")

    async def grade_sync(self, task: EvaluationTask) -> EvaluationResponse:
        raise NotImplementedError("Grade sync is not implemented yet")
```
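For orientation, here is a minimal sketch of how an inline llama-stack provider like this one is typically wired up. The `get_provider_impl` entry point, the `deps` mapping keyed by the `Api` enum, and the `.evaluation` module name are assumptions based on the convention other inline providers follow, not part of this diff:

```python
# Hypothetical wiring sketch -- mirrors the inline-provider convention used
# elsewhere in llama-stack; the entry-point name, deps key type, and the
# `.evaluation` module name are assumptions, not part of this PR.
from llama_stack.providers.datatypes import Api

from .config import MetaReferenceEvaluationConfig
from .evaluation import MetaReferenceEvaluationImpl


async def get_provider_impl(config: MetaReferenceEvaluationConfig, deps):
    # deps maps each declared dependency to its resolved implementation.
    impl = MetaReferenceEvaluationImpl(
        config,
        datasetio_api=deps[Api.datasetio],
        datasets_api=deps[Api.datasets],
        inference_api=deps[Api.inference],
        agents_api=deps[Api.agents],
    )
    await impl.initialize()
    return impl
```

Until the graders are migrated, any call that reaches `run`, `run_sync`, `grade`, or `grade_sync` on the returned impl raises `NotImplementedError`; only registration and the lifecycle hooks succeed.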