jobs eval scoring

This commit is contained in:
Xi Yan 2025-03-13 11:27:56 -07:00
parent 3a75799900
commit 7b4f7888f1
5 changed files with 1537 additions and 939 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -20,6 +20,13 @@ class JobStatus(Enum):
cancelled = "cancelled"
class JobType(Enum):
    # Names for the kinds of job. Each member's value is the same string as
    # its name, so a member can be looked up either way: JobType["scoring"]
    # and JobType("scoring") return the same member.
    # (A comment is used instead of a docstring so the enum's __doc__ is unchanged.)
    batch_inference = "batch_inference"
    scoring = "scoring"
    evaluation = "evaluation"
    post_training = "post_training"
@json_schema_type
class CommonJobFields(BaseModel):
"""Common fields for all jobs.

View file

@ -10,7 +10,7 @@ from pydantic import BaseModel, Field
from typing_extensions import Annotated
from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.common.job_types import CommonJobFields, JobStatus
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring import ScoringResult
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -61,15 +61,32 @@ class EvaluateResponse(BaseModel):
scores: Dict[str, ScoringResult]
@json_schema_type
class EvalJob(CommonJobFields):
    # Job model for an evaluation run: the fields inherited from
    # CommonJobFields plus the result locations and the creation inputs below.

    # NOTE(review): this literal is "eval", while JobType.evaluation in
    # llama_stack.apis.common.job_types has the value "evaluation". Confirm
    # the two spellings are meant to differ before relying on either.
    type: Literal["eval"] = "eval"

    # Result locations; both default to a fresh empty list per instance.
    result_files: List[str] = Field(
        description="The file ids of the eval results.",
        default_factory=list,
    )
    result_datasets: List[str] = Field(
        description="The ids of the datasets containing the eval results.",
        default_factory=list,
    )

    # Inputs the job was created from; no default is declared for either.
    benchmark_id: str = Field(description="The id of the benchmark to evaluate on.")
    candidate: EvalCandidate = Field(description="The candidate to evaluate on.")
class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
@webmethod(route="/eval/jobs", method="POST")
async def evaluate_benchmark(
self,
benchmark_id: str,
candidate: EvalCandidate,
) -> Job:
) -> EvalJob:
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
@ -85,37 +102,42 @@ class Eval(Protocol):
candidate: EvalCandidate,
) -> EvaluateResponse:
"""Evaluate a list of rows on a candidate.
:param dataset_rows: The rows to evaluate.
:param scoring_fn_ids: The scoring function ids to use for the evaluation.
:param candidate: The candidate to evaluate on.
:return: EvaluateResponse object containing generations and scores
"""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
"""Get the status of a job.
@webmethod(route="/eval/jobs", method="GET")
async def list_eval_jobs(self) -> List[EvalJob]:
    """List all evaluation jobs.

    :return: A list of evaluation jobs.
    """
    ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
@webmethod(route="/eval/job/{job_id}", method="GET")
async def get_eval_job(self, job_id: str) -> Optional[EvalJob]:
    """Get an evaluation job by id.

    :param job_id: The id of the job to get.
    :return: The job, or None (the return type is optional).
    """
    ...
@webmethod(route="/eval/job/{job_id}", method="DELETE")
async def delete_eval_job(self, job_id: str) -> Optional[EvalJob]:
    """Delete an evaluation job.

    :param job_id: The id of the job to delete.
    :return: An EvalJob, or None (the return type is optional).
    """
    ...
@webmethod(route="/eval/job/{job_id}/cancel", method="POST")
async def cancel_eval_job(self, job_id: str) -> Optional[EvalJob]:
    """Cancel an evaluation job.

    :param job_id: The id of the job to cancel.
    :return: An EvalJob, or None (the return type is optional).
    """
    ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
    """Get the result of a job.

    :param benchmark_id: The ID of the benchmark to run the evaluation on.
    :param job_id: The ID of the job to get the result of.
    :return: The result of the job.
    """

View file

@ -4,10 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from pydantic import BaseModel
from pydantic import BaseModel, Field
from llama_stack.apis.common.job_types import CommonJobFields, JobType
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.schema_utils import json_schema_type, webmethod
@ -47,6 +48,27 @@ class ScoreResponse(BaseModel):
results: Dict[str, ScoringResult]
@json_schema_type
class ScoringJob(CommonJobFields):
    # Job model for a scoring run: the fields inherited from CommonJobFields
    # plus the result locations and the creation inputs below.
    type: Literal["scoring"] = "scoring"

    # Result locations; both default to a fresh empty list per instance.
    result_files: List[str] = Field(
        description="The file ids of the scoring results.",
        default_factory=list,
    )
    result_datasets: List[str] = Field(
        description="The ids of the datasets containing the scoring results.",
        default_factory=list,
    )

    # Inputs the job was created from. dataset_id declares no default;
    # scoring_fn_ids defaults to an empty list.
    dataset_id: str = Field(description="The id of the dataset used for scoring.")
    scoring_fn_ids: List[str] = Field(
        description="The ids of the scoring functions used.",
        default_factory=list,
    )
class ScoringFunctionStore(Protocol):
def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
@ -60,7 +82,7 @@ class Scoring(Protocol):
self,
dataset_id: str,
scoring_fn_ids: List[str],
) -> ScoreBatchResponse: ...
) -> ScoringJob: ...
@webmethod(route="/scoring/rows", method="POST")
async def score(
@ -75,3 +97,36 @@ class Scoring(Protocol):
:return: ScoreResponse object containing rows and aggregated results
"""
...
@webmethod(route="/scoring/jobs", method="GET")
async def list_scoring_jobs(self) -> List[ScoringJob]:
    """List all scoring jobs.

    :return: A list of scoring jobs.
    """
    ...
@webmethod(route="/scoring/job/{job_id}", method="GET")
async def get_scoring_job(self, job_id: str) -> Optional[ScoringJob]:
    """Get a scoring job by id.

    :param job_id: The id of the job to get.
    :return: The job, or None (the return type is optional).
    """
    ...
@webmethod(route="/scoring/job/{job_id}", method="DELETE")
async def delete_scoring_job(self, job_id: str) -> Optional[ScoringJob]:
    """Delete a scoring job.

    :param job_id: The id of the job to delete.
    :return: A ScoringJob, or None (the return type is optional).
    """
    ...
@webmethod(route="/scoring/job/{job_id}/cancel", method="POST")
async def cancel_scoring_job(self, job_id: str) -> Optional[ScoringJob]:
    """Cancel a scoring job.

    :param job_id: The id of the job to cancel.
    :return: A ScoringJob, or None (the return type is optional).
    """
    ...