llama-stack-mirror/llama_stack/apis/eval/eval.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Dict, Literal, Optional, Protocol, Union

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import BaseJob
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@json_schema_type
class ModelCandidate(BaseModel):
    """A model candidate for evaluation.

    :param model: The model ID to evaluate.
    :param sampling_params: The sampling parameters for the model.
    :param system_message: (Optional) The system message providing instructions or context to the model.
    """

    type: Literal["model"] = "model"
    model: str
    sampling_params: SamplingParams
    system_message: Optional[SystemMessage] = None


@json_schema_type
class AgentCandidate(BaseModel):
    """An agent candidate for evaluation.

    :param config: The configuration for the agent candidate.
    """

    type: Literal["agent"] = "agent"
    config: AgentConfig


EvalCandidate = Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")]
register_schema(EvalCandidate, name="EvalCandidate")


class EvaluateJob(BaseJob, BaseModel):
    type: Literal["eval"] = "eval"


class ListEvaluateJobsResponse(BaseModel):
    data: list[EvaluateJob]


@json_schema_type
class BenchmarkConfig(BaseModel):
    """A benchmark configuration for evaluation.

    :param eval_candidate: The candidate to evaluate.
    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
    """

    eval_candidate: EvalCandidate
    scoring_params: Dict[str, ScoringFnParams] = Field(
        description="Map between scoring function id and parameters for each scoring function you want to run",
        default_factory=dict,
    )
    num_examples: Optional[int] = Field(
        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
        default=None,
    )
    # we could optinally add any specific dataset config here


class Eval(Protocol):
    """Llama Stack Evaluation API for running evaluations on model and agent candidates."""

    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluate", method="POST")
    async def evaluate(
        self,
        benchmark_id: str,
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateJob: ...

    # CRUD operations on running jobs
    @webmethod(route="/evaluate/jobs/{job_id:path}", method="GET")
    async def get_evaluate_job(self, job_id: str) -> EvaluateJob: ...

    @webmethod(route="/evaluate/jobs", method="GET")
    async def list_evaluate_jobs(self) -> ListEvaluateJobsResponse: ...

    @webmethod(route="/evaluate/jobs/{job_id:path}", method="POST")
    async def update_evaluate_job(self, job: EvaluateJob) -> EvaluateJob: ...

    @webmethod(route="/evaluate/job/{job_id:path}", method="DELETE")
    async def delete_evaluate_job(self, job_id: str) -> None: ...

    # Note: pause/resume/cancel are achieved as follows:
    # - POST with status=paused
    # - POST with status=resuming
    # - POST with status=cancelled