Merge branch 'eval_api_final' into delete_eval_scoring_scoring_fn

Xi Yan 2025-03-19 09:50:40 -07:00
commit e23531c9d0
4 changed files with 50 additions and 38 deletions


@@ -62,8 +62,8 @@ class Benchmarks(Protocol):
"""
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
- :param dataset_id: The ID of the dataset to be used to run the benchmark.
- :param grader_ids: List of grader ids to use for this benchmark.
+ :param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`
+ :param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()`
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
"""


@@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
candidate: EvaluationCandidate
- @json_schema_type
- class ScoringResult(BaseModel):
- """
- A scoring result for a single row.
- :param scores: The scoring result for each row. Each row is a map of grader column name to value.
- :param metrics: Map of metric name to aggregated value.
- """
- scores: List[Dict[str, Any]]
- metrics: Dict[str, Any]
@json_schema_type
class EvaluationResponse(BaseModel):
"""
A response to an inline evaluation.
- :param generations: The generations in rows for the evaluation.
- :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+ :param result_rows: The result data containing inputs, generations and grades in each row.
+ :param grades: Map of grader id to aggregated value.
"""
- generations: List[Dict[str, Any]]
- scores: Dict[str, ScoringResult]
+ result_rows: List[Dict[str, Any]]
+ grades: Dict[str, Any]
class Evaluation(Protocol):
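
To make the shape change concrete, here is a small runnable sketch of the post-change response model. The field names and types mirror the diff above; the example rows and grader id are made up, and the `@json_schema_type` decorator from the real code is omitted to keep the sketch self-contained.

```python
from typing import Any, Dict, List

from pydantic import BaseModel


class EvaluationResponse(BaseModel):
    # Each row carries the inputs, the generation, and the per-row grades.
    result_rows: List[Dict[str, Any]]
    # Aggregated value per grader id (replaces the removed per-grader ScoringResult).
    grades: Dict[str, Any]


response = EvaluationResponse(
    result_rows=[
        {"input": "2 + 2 = ?", "generation": "4", "accuracy": 1.0},
        {"input": "Capital of France?", "generation": "Lyon", "accuracy": 0.0},
    ],
    grades={"accuracy": 0.5},
)
print(response.grades["accuracy"])  # 0.5
```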