new apis
commit 035b2dcb60 (parent d34b70e3ab)
9 changed files with 2365 additions and 2190 deletions
llama_stack/apis/evaluation/evaluation.py (new file, 175 lines)
@@ -0,0 +1,175 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, Union

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import CommonJobFields, JobType
from llama_stack.apis.datasets import DataSource
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@json_schema_type
class ModelCandidate(BaseModel):
    """A model candidate for evaluation.

    :param model_id: The model ID to evaluate.
    :param sampling_params: The sampling parameters for the model.
    :param system_message: (Optional) The system message providing instructions or context to the model.
    """

    type: Literal["model"] = "model"
    model_id: str
    sampling_params: SamplingParams
    system_message: Optional[SystemMessage] = None


@json_schema_type
class AgentCandidate(BaseModel):
    """An agent candidate for evaluation.

    :param config: The configuration for the agent candidate.
    """

    type: Literal["agent"] = "agent"
    config: AgentConfig


EvaluationCandidate = register_schema(
    Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
    name="EvaluationCandidate",
)


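# Example (illustrative sketch, not part of the API definition): building a
# candidate. Assumes SamplingParams is constructible with defaults; the model
# ID and system message below are hypothetical.
_example_candidate: EvaluationCandidate = ModelCandidate(
    model_id="llama-3-8b-instruct",  # hypothetical model ID
    sampling_params=SamplingParams(),  # assumed: all fields have defaults
    system_message=SystemMessage(content="Answer concisely."),
)

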
@json_schema_type
class BenchmarkTask(BaseModel):
    type: Literal["benchmark_id"] = "benchmark_id"
    benchmark_id: str


@json_schema_type
class DatasetGraderTask(BaseModel):
    type: Literal["dataset_grader"] = "dataset_grader"
    dataset_id: str
    grader_ids: List[str]


@json_schema_type
class DataSourceGraderTask(BaseModel):
    type: Literal["data_source_grader"] = "data_source_grader"
    data_source: DataSource
    grader_ids: List[str]


EvaluationTask = register_schema(
    Annotated[
        Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
        Field(discriminator="type"),
    ],
    name="EvaluationTask",
)


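# Example (illustrative sketch): the `type` discriminator is fixed by each
# model's Literal default, so callers only pass the payload fields. The
# benchmark, dataset, and grader IDs below are hypothetical.
_example_task: EvaluationTask = DatasetGraderTask(
    dataset_id="my-eval-dataset",
    grader_ids=["accuracy", "faithfulness"],
)
_example_benchmark: EvaluationTask = BenchmarkTask(benchmark_id="mmlu")

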
@json_schema_type
class EvaluationJob(CommonJobFields):
    type: Literal[JobType.evaluation.value] = JobType.evaluation.value

    # input params for the submitted evaluation job
    task: EvaluationTask
    candidate: EvaluationCandidate


@json_schema_type
class ScoringResult(BaseModel):
    """
    A scoring result for a single grader.

    :param scores: The scoring result for each row. Each row is a map of grader column name to value.
    :param metrics: Map of metric name to aggregated value.
    """

    scores: List[Dict[str, Any]]
    metrics: Dict[str, Any]


@json_schema_type
class EvaluationResponse(BaseModel):
    """
    A response to an inline evaluation.

    :param generations: The generations, one row per evaluated example.
    :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
    """

    generations: List[Dict[str, Any]]
    scores: Dict[str, ScoringResult]


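# Example (illustrative sketch): consuming an inline evaluation response. The
# grader ID and metric name shown are hypothetical.
def _print_summary(response: EvaluationResponse) -> None:
    for grader_id, result in response.scores.items():
        print(grader_id, result.metrics)  # e.g. {"accuracy": 0.82}
        # result.scores holds one dict of grader columns per evaluated row
    print(f"{len(response.generations)} rows generated")

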
class Evaluation(Protocol):
    @webmethod(route="/evaluation/run", method="POST")
    async def run(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationJob:
        """
        Schedule an evaluation job to run asynchronously.

        :param task: The task to evaluate. One of:
            - BenchmarkTask: Run the evaluation against a benchmark_id
            - DatasetGraderTask: Run the evaluation against a dataset_id and a list of grader_ids
            - DataSourceGraderTask: Run the evaluation against a data source (e.g. rows, uri, etc.) and a list of grader_ids
        :param candidate: The candidate to evaluate.
        :return: The submitted evaluation job.
        """
        ...

@webmethod(route="/evaluation/run_inline", method="POST")
|
||||
async def run_inline(
|
||||
self,
|
||||
task: EvaluationTask,
|
||||
candidate: EvaluationCandidate,
|
||||
) -> EvaluationResponse:
|
||||
"""
|
||||
Run an evaluation job inline.
|
||||
|
||||
:param task: The task to evaluate. One of:
|
||||
- BenchmarkTask: Run evaluation task against a benchmark_id
|
||||
- DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
|
||||
- DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||
:param candidate: The candidate to evaluate.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/evaluation/grade", method="POST")
|
||||
async def grade(self, task: EvaluationTask) -> EvaluationJob:
|
||||
"""
|
||||
Run an grading job with generated results. Use this when you have generated results from inference in a dataset.
|
||||
|
||||
:param task: The task to evaluate. One of:
|
||||
- BenchmarkTask: Run evaluation task against a benchmark_id
|
||||
- DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
|
||||
- DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||
|
||||
:return: The evaluation job containing grader scores.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/evaluation/grade_inline", method="POST")
|
||||
async def grade_inline(self, task: EvaluationTask) -> EvaluationResponse:
|
||||
"""
|
||||
Run an grading job with generated results inline.
|
||||
|
||||
:param task: The task to evaluate. One of:
|
||||
- BenchmarkTask: Run evaluation task against a benchmark_id
|
||||
- DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
|
||||
- DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
|
||||
|
||||
:return: The evaluation job containing grader scores. "generations" is not populated in the response.
|
||||
"""
|
||||
...
|
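
For context, a minimal client-side sketch of driving this protocol. The `evaluation` object stands for any implementation of the Evaluation protocol; how it is obtained (client wiring, provider registration) is outside this diff, and all IDs are hypothetical.

from llama_stack.apis.evaluation.evaluation import (
    DatasetGraderTask,
    Evaluation,
    ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams

async def evaluate_model(evaluation: Evaluation) -> None:
    task = DatasetGraderTask(dataset_id="my-eval-dataset", grader_ids=["accuracy"])
    candidate = ModelCandidate(
        model_id="llama-3-8b-instruct",
        sampling_params=SamplingParams(),  # assumed: defaults suffice
    )

    # Asynchronous path: submit and get a job handle back.
    job = await evaluation.run(task=task, candidate=candidate)

    # Inline path: block until generations and scores are ready.
    response = await evaluation.run_inline(task=task, candidate=candidate)

    # Grading only: the dataset already holds generations from inference.
    graded = await evaluation.grade_inline(task=task)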