comments

2025-03-18 15:01:41 -07:00 · 2025-03-18 15:01:41 -07:00 · a69759613a
commit a69759613a
parent a8b0467ec3
7 changed files with 2486 additions and 389 deletions
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,7 +62,7 @@ class Benchmarks(Protocol):
        """
        Register a new benchmark.

-        :param dataset_id: The ID of the dataset to used to run the benchmark.
+        :param dataset_id: The ID of the dataset to be used to run the benchmark.
        :param grader_ids: List of grader ids to use for this benchmark.
        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
        :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
@@ -87,3 +87,10 @@ class Benchmarks(Protocol):
        :param benchmark_id: The ID of the benchmark to get.
        """
        ...
+
+    @webmethod(route="/benchmarks/{benchmark_id}", method="DELETE")
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """
+        Unregister a benchmark by ID.
+        """
+        ...
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 from datetime import datetime
 from enum import Enum
-from typing import Optional

 from pydantic import BaseModel

@@ -38,12 +37,12 @@ class CommonJobFields(BaseModel):
    :param id: The ID of the job.
    :param status: The status of the job.
    :param created_at: The time the job was created.
-    :param ended_at: The time the job ended.
+    :param completed_at: The time the job completed.
    :param error: If status of the job is failed, this will contain the error message.
    """

    id: str
    status: JobStatus
    created_at: datetime
-    ended_at: Optional[datetime] = None
-    error: Optional[str] = None
+    completed_at: datetime | None = None
+    error: str | None = None
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -48,28 +48,28 @@ EvaluationCandidate = register_schema(


@json_schema_type
-class BenchmarkTask(BaseModel):
-    type: Literal["benchmark_id"] = "benchmark_id"
+class BenchmarkEvaluationTask(BaseModel):
+    type: Literal["benchmark"] = "benchmark"
    benchmark_id: str


@json_schema_type
-class DatasetGraderTask(BaseModel):
-    type: Literal["dataset_grader"] = "dataset_grader"
+class DatasetEvaluationTask(BaseModel):
+    type: Literal["dataset"] = "dataset"
    dataset_id: str
    grader_ids: List[str]


@json_schema_type
-class DataSourceGraderTask(BaseModel):
-    type: Literal["data_source_grader"] = "data_source_grader"
+class DataEvaluationTask(BaseModel):
+    type: Literal["data"] = "data"
    data_source: DataSource
    grader_ids: List[str]


 EvaluationTask = register_schema(
    Annotated[
-        Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
+        Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
        Field(discriminator="type"),
    ],
    name="EvaluationTask",
--- a/llama_stack/apis/graders/graders.py
+++ b/llama_stack/apis/graders/graders.py
@@ -29,6 +29,13 @@ from .graders import *  # noqa: F401 F403
 class GraderType(Enum):
    """
    A type of grader. Each type is a criteria for evaluating answers.
+
+    :cvar llm: Use an LLM to score the answer.
+    :cvar regex_parser: Use a regex parser to score the answer.
+    :cvar equality: Check if the answer is equal to the reference answer.
+    :cvar subset_of: Check if the answer is a subset of the reference answer.
+    :cvar factuality: Check if the answer is factually correct using LLM as judge.
+    :cvar faithfulness: Check if the answer is faithful to the reference answer using LLM as judge.
    """

    llm = "llm"
@@ -221,9 +228,9 @@ class Graders(Protocol):
        ...

    @webmethod(route="/graders/{grader_id:path}", method="DELETE")
-    async def delete_grader(self, grader_id: str) -> None:
+    async def unregister_grader(self, grader_id: str) -> None:
        """
-        Delete a grader by ID.
+        Unregister a grader by ID.
        :param grader_id: The ID of the grader.
        """
        ...