# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, List, Literal, Optional, Protocol, Union

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import CommonJobFields, JobType
from llama_stack.apis.datasets import DataSource
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@json_schema_type
class ModelCandidate(BaseModel):
    """A model candidate for evaluation.

    :param model_id: The model ID to evaluate.
    :param sampling_params: The sampling parameters for the model.
    :param system_message: (Optional) The system message providing instructions or context to the model.
    """

    type: Literal["model"] = "model"
    model_id: str
    sampling_params: SamplingParams
    system_message: Optional[SystemMessage] = None
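

# Illustrative usage: constructing a ModelCandidate. The model ID is
# hypothetical, and SamplingParams is assumed to be constructible with its
# defaults.
#
#   candidate = ModelCandidate(
#       model_id="meta-llama/Llama-3.1-8B-Instruct",
#       sampling_params=SamplingParams(),
#   )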


@json_schema_type
class AgentCandidate(BaseModel):
    """An agent candidate for evaluation.

    :param agent_config: The configuration for the agent candidate.
    """

    type: Literal["agent"] = "agent"
    agent_config: AgentConfig


EvaluationCandidate = register_schema(
    Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
    name="EvaluationCandidate",
)
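

# Because `type` discriminates the union, a serialized candidate re-validates
# into the correct concrete class. A minimal sketch, assuming pydantic v2's
# TypeAdapter, that register_schema returns the annotated type unchanged, and
# that SamplingParams validates from an empty dict:
#
#   from pydantic import TypeAdapter
#
#   candidate = TypeAdapter(EvaluationCandidate).validate_python(
#       {"type": "model", "model_id": "my-model", "sampling_params": {}}
#   )
#   assert isinstance(candidate, ModelCandidate)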


@json_schema_type
class EvaluationTask(BaseModel):
    """
    A task for evaluation. To specify a task, one of the following must be provided:
    - `benchmark_id`: Run the evaluation against a benchmark. Use this when you have a curated dataset and have settled on your graders.
    - `dataset_id` and `grader_ids`: Run the evaluation against a registered dataset with a list of graders. Use this when you have datasets and/or are still iterating on your graders.
    - `data_source` and `grader_ids`: Run the evaluation against an ad-hoc data source (e.g. rows, uri, etc.) with a list of graders. Prefer this early in your evaluation cycle, when you are still experimenting with your data and graders.

    :param benchmark_id: (Optional) The benchmark ID to evaluate against.
    :param dataset_id: (Optional) The dataset ID to evaluate against.
    :param data_source: (Optional) The data source to evaluate against.
    :param grader_ids: (Optional) The IDs of the graders to apply.
    """

    benchmark_id: Optional[str] = None
    dataset_id: Optional[str] = None
    data_source: Optional[DataSource] = None
    grader_ids: Optional[List[str]] = None
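

# Illustrative: the three ways to specify a task (all IDs hypothetical);
# `my_data_source` stands in for a DataSource built elsewhere.
#
#   by_benchmark = EvaluationTask(benchmark_id="my-benchmark")
#   by_dataset = EvaluationTask(dataset_id="my-dataset", grader_ids=["my-grader"])
#   by_source = EvaluationTask(data_source=my_data_source, grader_ids=["my-grader"])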


@json_schema_type
class EvaluationJob(CommonJobFields):
    type: Literal[JobType.evaluation.value] = JobType.evaluation.value

    # input params for the submitted evaluation job
    task: EvaluationTask
    candidate: EvaluationCandidate


@json_schema_type
class EvaluationResponse(BaseModel):
    """
    A response to an inline evaluation.

    :param result_rows: The result data containing inputs, generations and grades in each row.
    :param grades: Map of grader ID to aggregated value.
    """

    result_rows: List[Dict[str, Any]]
    grades: Dict[str, Any]
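

# Illustrative: constructing and consuming an EvaluationResponse. The row keys
# shown are hypothetical; actual keys depend on the dataset and graders used.
#
#   response = EvaluationResponse(
#       result_rows=[{"input": "2+2?", "generation": "4", "grades": {"my-grader": 1.0}}],
#       grades={"my-grader": 1.0},
#   )
#   print(response.grades["my-grader"])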


class Evaluation(Protocol):
    @webmethod(route="/evaluation/run", method="POST")
    async def run(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationJob:
        """
        Schedule a full evaluation job: generate results using the candidate, then grade them.

        :param task: The task to evaluate. To specify a task, one of the following must be provided:
            - `benchmark_id`: Run evaluation task against a benchmark_id
            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
            - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        :return: The scheduled evaluation job.
        """
        ...
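
    # Illustrative call against some implementation `impl` of this protocol
    # (`impl`, `task`, and `candidate` are assumed to exist; the returned job
    # carries the fields inherited from CommonJobFields):
    #
    #   job = await impl.run(task=task, candidate=candidate)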

    @webmethod(route="/evaluation/run_sync", method="POST")
    async def run_sync(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationResponse:
        """
        Run an evaluation synchronously, i.e., without scheduling a job.
        Use this for quick testing, or when the number of rows is limited. Some implementations may impose stricter limits on the inputs they accept.

        :param task: The task to evaluate. To specify a task, one of the following must be provided:
            - `benchmark_id`: Run evaluation task against a benchmark_id
            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
            - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        :return: The evaluation response containing result rows and aggregated grades.
        """
        ...
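
    # Illustrative synchronous run (identifiers hypothetical); the response
    # carries per-row results and aggregated grades:
    #
    #   response = await impl.run_sync(task=task, candidate=candidate)
    #   for row in response.result_rows:
    #       print(row)
    #   print(response.grades)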

    @webmethod(route="/evaluation/grade", method="POST")
    async def grade(self, task: EvaluationTask) -> EvaluationJob:
        """
        Schedule a grading job that grades previously generated (model or agent) results, which are expected to already be in the dataset.

        :param task: The task to evaluate. To specify a task, one of the following must be provided:
            - `benchmark_id`: Run evaluation task against a benchmark_id
            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
            - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :return: The evaluation job containing grader scores.
        """
        ...

    @webmethod(route="/evaluation/grade_sync", method="POST")
    async def grade_sync(self, task: EvaluationTask) -> EvaluationResponse:
        """
        Run grading synchronously on generated results, i.e., without scheduling a job.
        Use this for quick testing, or when the number of rows is limited. Some implementations may impose stricter limits on the inputs they accept.

        :param task: The task to evaluate. To specify a task, one of the following must be provided:
            - `benchmark_id`: Run evaluation task against a benchmark_id
            - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids
            - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :return: The evaluation response containing grader scores. "generations" is not populated in the response.
        """
        ...
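

# Illustrative end-to-end grading flow against an implementation `impl` of the
# Evaluation protocol (identifiers hypothetical): generations already live in
# the dataset, so only the graders run.
#
#   task = EvaluationTask(dataset_id="generated-results", grader_ids=["my-grader"])
#   response = await impl.grade_sync(task)
#   print(response.grades)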