From b98497ee565baa878fad380bd2188d401370b533 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Tue, 18 Mar 2025 18:10:45 -0700
Subject: [PATCH] docs

---
 llama_stack/apis/benchmarks/benchmarks.py |  2 +-
 llama_stack/apis/evaluation/evaluation.py | 38 ++++++++++++-----------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 8017e5c27..3c5624e62 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -60,7 +60,7 @@ class Benchmarks(Protocol):
         metadata: Optional[Dict[str, Any]] = None,
     ) -> Benchmark:
         """
-        Register a new benchmark.
+        Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
 
         :param dataset_id: The ID of the dataset to be used to run the benchmark.
         :param grader_ids: List of grader ids to use for this benchmark.
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index e667acfd4..31e7f4909 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -119,12 +119,12 @@ class Evaluation(Protocol):
         candidate: EvaluationCandidate,
     ) -> EvaluationJob:
         """
-        Run an evaluation job.
+        Schedule a full evaluation job: generate results using the candidate, then grade them.
 
         :param task: The task to evaluate. One of:
-         - BenchmarkTask: Run evaluation task against a benchmark_id
-         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
         :param candidate: The candidate to evaluate.
         """
         ...
@@ -136,25 +136,26 @@ class Evaluation(Protocol):
         candidate: EvaluationCandidate,
     ) -> EvaluationResponse:
         """
-        Run an evaluation job inline.
+        Run an evaluation synchronously, i.e., without scheduling a job.
+        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
 
         :param task: The task to evaluate. One of:
-         - BenchmarkTask: Run evaluation task against a benchmark_id
-         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
         :param candidate: The candidate to evaluate.
         """
         ...
 
-    @webmethod(route="/evaluation/grade", method="POST")
-    async def grade(self, task: EvaluationTask) -> EvaluationJob:
+    @webmethod(route="/evaluation/grading", method="POST")
+    async def grading(self, task: EvaluationTask) -> EvaluationJob:
         """
-        Run an grading job with generated results. Use this when you have generated results from inference in a dataset.
+        Schedule a grading job, by grading generated results. The generated results are expected to be in the dataset.
 
         :param task: The task to evaluate. One of:
-         - BenchmarkTask: Run evaluation task against a benchmark_id
-         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
 
         :return: The evaluation job containing grader scores.
         """
@@ -163,12 +164,13 @@ class Evaluation(Protocol):
     @webmethod(route="/evaluation/grade_sync", method="POST")
     async def grade_sync(self, task: EvaluationTask) -> EvaluationResponse:
         """
-        Run an grading job with generated results inline.
+        Run grading synchronously on generated results, i.e., without scheduling a job.
+        You should use this for quick testing, or when the number of rows is limited. Some implementations may have stricter restrictions on inputs which will be accepted.
 
         :param task: The task to evaluate. One of:
-         - BenchmarkTask: Run evaluation task against a benchmark_id
-         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
-         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+         - BenchmarkEvaluationTask: Run evaluation task against a benchmark_id
+         - DatasetEvaluationTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataEvaluationTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
 
         :return: The evaluation job containing grader scores. "generations" is not populated in the response.
         """