feat(eval api): (2.2/n) delete eval / scoring / scoring_fn apis (#1700)

# What does this PR do? - To make it easier, delete existing `eval/scoring/scoring_function` apis. There will be a bunch of broken impls here. The sequence is: 1. migrate benchmark graders 2. clean up existing scoring functions - Add a skeleton evaluation impl to make tests pass. ## Test Plan tested in following PRs [//]: # (## Documentation)
2025-03-19 11:04:23 -07:00 · 2025-03-19 11:04:23 -07:00 · c1d18283d2
commit c1d18283d2
parent 0048274ec0
113 changed files with 408 additions and 3900 deletions
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -20,10 +20,9 @@ class Api(Enum):
    agents = "agents"
    vector_io = "vector_io"
    datasetio = "datasetio"
-    scoring = "scoring"
-    eval = "eval"
    post_training = "post_training"
    tool_runtime = "tool_runtime"
+    evaluation = "evaluation"

    telemetry = "telemetry"

@ -31,7 +30,6 @@ class Api(Enum):
    shields = "shields"
    vector_dbs = "vector_dbs"
    datasets = "datasets"
-    scoring_functions = "scoring_functions"
    benchmarks = "benchmarks"
    tool_groups = "tool_groups"

--- a/llama_stack/apis/eval/init.py
+++ b/llama_stack/apis/eval/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .eval import *  # noqa: F401 F403
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -1,145 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List, Literal, Optional, Protocol, Union
-
-from pydantic import BaseModel, Field
-from typing_extensions import Annotated
-
-from llama_stack.apis.agents import AgentConfig
-from llama_stack.apis.common.job_types import Job, JobStatus
-from llama_stack.apis.inference import SamplingParams, SystemMessage
-from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation.
-
-    :param model: The model ID to evaluate.
-    :param sampling_params: The sampling parameters for the model.
-    :param system_message: (Optional) The system message providing instructions or context to the model.
-    """
-
-    type: Literal["model"] = "model"
-    model: str
-    sampling_params: SamplingParams
-    system_message: Optional[SystemMessage] = None
-
-
-@json_schema_type
-class AgentCandidate(BaseModel):
-    """An agent candidate for evaluation.
-
-    :param config: The configuration for the agent candidate.
-    """
-
-    type: Literal["agent"] = "agent"
-    config: AgentConfig
-
-
-EvalCandidate = register_schema(
-    Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
-    name="EvalCandidate",
-)
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation.
-
-    :param eval_candidate: The candidate to evaluate.
-    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
-    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
-    """
-
-    eval_candidate: EvalCandidate
-    scoring_params: Dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-        default_factory=dict,
-    )
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-    # we could optinally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation.
-
-    :param generations: The generations from the evaluation.
-    :param scores: The scores from the evaluation.
-    """
-
-    generations: List[Dict[str, Any]]
-    # each key in the dict is a scoring function name
-    scores: Dict[str, ScoringResult]
-
-
-class Eval(Protocol):
-    """Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
-    async def run_eval(
-        self,
-        benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
-        :return: The job that was created to run the evaluation.
-        """
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
-    async def evaluate_rows(
-        self,
-        benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param input_rows: The rows to evaluate.
-        :param scoring_functions: The scoring functions to use for the evaluation.
-        :param benchmark_config: The configuration for the benchmark.
-        :return: EvaluateResponse object containing generations and scores
-        """
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, benchmark_id: str, job_id: str) -> JobStatus:
-        """Get the status of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the status of.
-        :return: The status of the evaluationjob.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-        """Cancel a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to cancel.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        """Get the result of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the result of.
-        :return: The result of the job.
-        """
--- a/llama_stack/apis/scoring/init.py
+++ b/llama_stack/apis/scoring/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring import *  # noqa: F401 F403
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@ -1,78 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-# mapping of metric to value
-ScoringResultRow = Dict[str, Any]
-
-
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param score_rows: The scoring result for each row. Each row is a map of column name to value.
-    :param aggregated_results: Map of metric name to aggregated value
-    """
-
-    score_rows: List[ScoringResultRow]
-    # aggregated metrics to value
-    aggregated_results: Dict[str, Any]
-
-
-@json_schema_type
-class ScoreBatchResponse(BaseModel):
-    dataset_id: Optional[str] = None
-    results: Dict[str, ScoringResult]
-
-
-@json_schema_type
-class ScoreResponse(BaseModel):
-    """
-    The response from scoring.
-
-    :param results: A map of scoring function name to ScoringResult.
-    """
-
-    # each key in the dict is a scoring function name
-    results: Dict[str, ScoringResult]
-
-
-class ScoringFunctionStore(Protocol):
-    def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
-
-
-@runtime_checkable
-class Scoring(Protocol):
-    scoring_function_store: ScoringFunctionStore
-
-    @webmethod(route="/scoring/score-batch", method="POST")
-    async def score_batch(
-        self,
-        dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
-        save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse: ...
-
-    @webmethod(route="/scoring/score", method="POST")
-    async def score(
-        self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
-    ) -> ScoreResponse:
-        """Score a list of rows.
-
-        :param input_rows: The rows to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
-        :return: ScoreResponse object containing rows and aggregated results
-        """
-        ...
--- a/llama_stack/apis/scoring_functions/init.py
+++ b/llama_stack/apis/scoring_functions/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring_functions import *  # noqa: F401 F403
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -1,149 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Protocol,
-    Union,
-    runtime_checkable,
-)
-
-from pydantic import BaseModel, Field
-from typing_extensions import Annotated
-
-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-# Perhaps more structure can be imposed on these functions. Maybe they could be associated
-# with standard metrics so they can be rolled up?
-@json_schema_type
-class ScoringFnParamsType(Enum):
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(Enum):
-    average = "average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
-    judge_model: str
-    prompt_template: Optional[str] = None
-    judge_score_regexes: Optional[List[str]] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=list,
-    )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
-    parsing_regexes: Optional[List[str]] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=list,
-    )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = register_schema(
-    Annotated[
-        Union[
-            LLMAsJudgeScoringFnParams,
-            RegexParserScoringFnParams,
-            BasicScoringFnParams,
-        ],
-        Field(discriminator="type"),
-    ],
-    name="ScoringFnParams",
-)
-
-
-class CommonScoringFnFields(BaseModel):
-    description: Optional[str] = None
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: Optional[ScoringFnParams] = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    scoring_fn_id: str
-    provider_id: Optional[str] = None
-    provider_scoring_fn_id: Optional[str] = None
-
-
-class ListScoringFunctionsResponse(BaseModel):
-    data: List[ScoringFn]
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    @webmethod(route="/scoring-functions", method="GET")
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
-
-    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ...
-
-    @webmethod(route="/scoring-functions", method="POST")
-    async def register_scoring_function(
-        self,
-        scoring_fn_id: str,
-        description: str,
-        return_type: ParamType,
-        provider_scoring_fn_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[ScoringFnParams] = None,
-    ) -> None: ...