eval/scoring/datasetio doc

2025-08-12 04:50:39 +00:00 · 2025-03-04 14:54:43 -08:00 · 2025-03-04 14:54:43 -08:00 · 83d78cca9c
commit 83d78cca9c
parent c30cba9db2
5 changed files with 262 additions and 60 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -69,11 +69,12 @@
                "tags": [
                    "DatasetIO"
                ],
-                "description": "",
+                "description": "Get a paginated list of rows from a dataset.",
                "parameters": [
                    {
                        "name": "dataset_id",
                        "in": "query",
+                        "description": "The ID of the dataset to get the rows from.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -82,6 +83,7 @@
                    {
                        "name": "rows_in_page",
                        "in": "query",
+                        "description": "The number of rows to get per page.",
                        "required": true,
                        "schema": {
                            "type": "integer"
@ -90,6 +92,7 @@
                    {
                        "name": "page_token",
                        "in": "query",
+                        "description": "The token to get the next page of rows.",
                        "required": false,
                        "schema": {
                            "type": "string"
@ -98,6 +101,7 @@
                    {
                        "name": "filter_condition",
                        "in": "query",
+                        "description": "(Optional) A condition to filter the rows by.",
                        "required": false,
                        "schema": {
                            "type": "string"
@ -896,7 +900,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "EvaluateResponse object containing generations and scores",
                        "content": {
                            "application/json": {
                                "schema": {
@ -921,11 +925,12 @@
                "tags": [
                    "Eval"
                ],
-                "description": "",
+                "description": "Evaluate a list of rows on a benchmark.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2121,7 +2126,7 @@
            "get": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The status of the evaluation job.",
                        "content": {
                            "application/json": {
                                "schema": {
@ -2153,11 +2158,12 @@
                "tags": [
                    "Eval"
                ],
-                "description": "",
+                "description": "Get the status of a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2166,6 +2172,7 @@
                    {
                        "name": "job_id",
                        "in": "path",
+                        "description": "The ID of the job to get the status of.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2194,11 +2201,12 @@
                "tags": [
                    "Eval"
                ],
-                "description": "",
+                "description": "Cancel a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2207,6 +2215,7 @@
                    {
                        "name": "job_id",
                        "in": "path",
+                        "description": "The ID of the job to cancel.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2219,7 +2228,7 @@
            "get": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The result of the job.",
                        "content": {
                            "application/json": {
                                "schema": {
@ -2244,11 +2253,12 @@
                "tags": [
                    "Eval"
                ],
-                "description": "",
+                "description": "Get the result of a job.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -2257,6 +2267,7 @@
                    {
                        "name": "job_id",
                        "in": "path",
+                        "description": "The ID of the job to get the result of.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -3287,7 +3298,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The job that was created to run the evaluation.",
                        "content": {
                            "application/json": {
                                "schema": {
@ -3312,11 +3323,12 @@
                "tags": [
                    "Eval"
                ],
-                "description": "",
+                "description": "Run an evaluation on a benchmark.",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -3418,7 +3430,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "ScoreResponse object containing rows and aggregated results",
                        "content": {
                            "application/json": {
                                "schema": {
@ -3443,7 +3455,7 @@
                "tags": [
                    "Scoring"
                ],
-                "description": "",
+                "description": "Score a list of rows.",
                "parameters": [],
                "requestBody": {
                    "content": {
@ -6187,7 +6199,8 @@
                        "default": "agent"
                    },
                    "config": {
-                        "$ref": "#/components/schemas/AgentConfig"
+                        "$ref": "#/components/schemas/AgentConfig",
+                        "description": "The configuration for the agent candidate."
                    }
                },
                "additionalProperties": false,
@ -6195,7 +6208,8 @@
                    "type",
                    "config"
                ],
-                "title": "AgentCandidate"
+                "title": "AgentCandidate",
+                "description": "An agent candidate for evaluation."
            },
            "AggregationFunctionType": {
                "type": "string",
@ -6232,16 +6246,19 @@
                "type": "object",
                "properties": {
                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate"
+                        "$ref": "#/components/schemas/EvalCandidate",
+                        "description": "The candidate to evaluate."
                    },
                    "scoring_params": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringFnParams"
-                        }
+                        },
+                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
                    },
                    "num_examples": {
-                        "type": "integer"
+                        "type": "integer",
+                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
                    }
                },
                "additionalProperties": false,
@ -6249,7 +6266,8 @@
                    "eval_candidate",
                    "scoring_params"
                ],
-                "title": "BenchmarkConfig"
+                "title": "BenchmarkConfig",
+                "description": "A benchmark configuration for evaluation."
            },
            "EvalCandidate": {
                "oneOf": [
@ -6311,13 +6329,16 @@
                        "default": "model"
                    },
                    "model": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The model ID to evaluate."
                    },
                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams"
+                        "$ref": "#/components/schemas/SamplingParams",
+                        "description": "The sampling parameters for the model."
                    },
                    "system_message": {
-                        "$ref": "#/components/schemas/SystemMessage"
+                        "$ref": "#/components/schemas/SystemMessage",
+                        "description": "(Optional) The system message providing instructions or context to the model."
                    }
                },
                "additionalProperties": false,
@ -6326,7 +6347,8 @@
                    "model",
                    "sampling_params"
                ],
-                "title": "ModelCandidate"
+                "title": "ModelCandidate",
+                "description": "A model candidate for evaluation."
            },
            "RegexParserScoringFnParams": {
                "type": "object",
@ -6405,16 +6427,19 @@
                                    }
                                ]
                            }
-                        }
+                        },
+                        "description": "The rows to evaluate."
                    },
                    "scoring_functions": {
                        "type": "array",
                        "items": {
                            "type": "string"
-                        }
+                        },
+                        "description": "The scoring functions to use for the evaluation."
                    },
                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig"
+                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "description": "The configuration for the benchmark."
                    }
                },
                "additionalProperties": false,
@ -6454,13 +6479,15 @@
                                    }
                                ]
                            }
-                        }
+                        },
+                        "description": "The generations from the evaluation."
                    },
                    "scores": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
-                        }
+                        },
+                        "description": "The scores from the evaluation."
                    }
                },
                "additionalProperties": false,
@ -6468,7 +6495,8 @@
                    "generations",
                    "scores"
                ],
-                "title": "EvaluateResponse"
+                "title": "EvaluateResponse",
+                "description": "The response from an evaluation."
            },
            "ScoringResult": {
                "type": "object",
@ -6499,7 +6527,8 @@
                                    }
                                ]
                            }
-                        }
+                        },
+                        "description": "The scoring result for each row. Each row is a map of column name to value."
                    },
                    "aggregated_results": {
                        "type": "object",
@ -6524,7 +6553,8 @@
                                    "type": "object"
                                }
                            ]
-                        }
+                        },
+                        "description": "Map of metric name to aggregated value"
                    }
                },
                "additionalProperties": false,
@ -6532,7 +6562,8 @@
                    "score_rows",
                    "aggregated_results"
                ],
-                "title": "ScoringResult"
+                "title": "ScoringResult",
+                "description": "A scoring result for a single scoring function, containing per-row scores and aggregated results."
            },
            "Session": {
                "type": "object",
@ -7021,13 +7052,16 @@
                                    }
                                ]
                            }
-                        }
+                        },
+                        "description": "The rows in the current page."
                    },
                    "total_count": {
-                        "type": "integer"
+                        "type": "integer",
+                        "description": "The total number of rows in the dataset."
                    },
                    "next_page_token": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The token to get the next page of rows."
                    }
                },
                "additionalProperties": false,
@ -7035,7 +7069,8 @@
                    "rows",
                    "total_count"
                ],
-                "title": "PaginatedRowsResult"
+                "title": "PaginatedRowsResult",
+                "description": "A paginated list of rows from a dataset."
            },
            "ScoringFn": {
                "type": "object",
@ -9307,7 +9342,8 @@
                "type": "object",
                "properties": {
                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig"
+                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "description": "The configuration for the benchmark."
                    }
                },
                "additionalProperties": false,
@ -9444,7 +9480,8 @@
                                    }
                                ]
                            }
-                        }
+                        },
+                        "description": "The rows to score."
                    },
                    "scoring_functions": {
                        "type": "object",
@ -9457,7 +9494,8 @@
                                    "type": "null"
                                }
                            ]
-                        }
+                        },
+                        "description": "The scoring functions to use for the scoring."
                    }
                },
                "additionalProperties": false,
@ -9474,14 +9512,16 @@
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
-                        }
+                        },
+                        "description": "A map of scoring function name to ScoringResult."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
-                "title": "ScoreResponse"
+                "title": "ScoreResponse",
+                "description": "The response from scoring."
            },
            "ScoreBatchRequest": {
                "type": "object",
@ -9896,7 +9936,8 @@
            "name": "Datasets"
        },
        {
-            "name": "Eval"
+            "name": "Eval",
+            "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
        },
        {
            "name": "Files (Coming Soon)"
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -31,25 +31,32 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - DatasetIO
-      description: ''
+      description: >-
+        Get a paginated list of rows from a dataset.
      parameters:
        - name: dataset_id
          in: query
+          description: >-
+            The ID of the dataset to get the rows from.
          required: true
          schema:
            type: string
        - name: rows_in_page
          in: query
+          description: The number of rows to get per page.
          required: true
          schema:
            type: integer
        - name: page_token
          in: query
+          description: The token to get the next page of rows.
          required: false
          schema:
            type: string
        - name: filter_condition
          in: query
+          description: >-
+            (Optional) A condition to filter the rows by.
          required: false
          schema:
            type: string
@ -613,7 +620,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            EvaluateResponse object containing generations and scores
          content:
            application/json:
              schema:
@ -630,10 +638,12 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Evaluate a list of rows on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
@ -1417,7 +1427,7 @@ paths:
    get:
      responses:
        '200':
-          description: OK
+          description: The status of the evaluation job.
          content:
            application/json:
              schema:
@ -1436,15 +1446,18 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Get the status of a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to get the status of.
          required: true
          schema:
            type: string
@ -1464,15 +1477,18 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Cancel a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to cancel.
          required: true
          schema:
            type: string
@ -1480,7 +1496,7 @@ paths:
    get:
      responses:
        '200':
-          description: OK
+          description: The result of the job.
          content:
            application/json:
              schema:
@ -1497,15 +1513,18 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Get the result of a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to get the result of.
          required: true
          schema:
            type: string
@ -2218,7 +2237,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            The job that was created to run the evaluation.
          content:
            application/json:
              schema:
@ -2235,10 +2255,12 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Run an evaluation on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
@ -2306,7 +2328,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            ScoreResponse object containing rows and aggregated results
          content:
            application/json:
              schema:
@ -2323,7 +2346,7 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
-      description: ''
+      description: Score a list of rows.
      parameters: []
      requestBody:
        content:
@ -4290,11 +4313,14 @@ components:
          default: agent
        config:
          $ref: '#/components/schemas/AgentConfig'
+          description: >-
+            The configuration for the agent candidate.
      additionalProperties: false
      required:
        - type
        - config
      title: AgentCandidate
+      description: An agent candidate for evaluation.
    AggregationFunctionType:
      type: string
      enum:
@ -4323,17 +4349,26 @@ components:
      properties:
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
+          description: The candidate to evaluate.
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            Map between scoring function id and parameters for each scoring function
+            you want to run
        num_examples:
          type: integer
+          description: >-
+            (Optional) The number of examples to evaluate. If not provided, all examples
+            in the dataset will be evaluated
      additionalProperties: false
      required:
        - eval_candidate
        - scoring_params
      title: BenchmarkConfig
+      description: >-
+        A benchmark configuration for evaluation.
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
@ -4376,16 +4411,22 @@ components:
          default: model
        model:
          type: string
+          description: The model ID to evaluate.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
+          description: The sampling parameters for the model.
        system_message:
          $ref: '#/components/schemas/SystemMessage'
+          description: >-
+            (Optional) The system message providing instructions or context to the
+            model.
      additionalProperties: false
      required:
        - type
        - model
        - sampling_params
      title: ModelCandidate
+      description: A model candidate for evaluation.
    RegexParserScoringFnParams:
      type: object
      properties:
@ -4431,12 +4472,16 @@ components:
                - type: string
                - type: array
                - type: object
+          description: The rows to evaluate.
        scoring_functions:
          type: array
          items:
            type: string
+          description: >-
+            The scoring functions to use for the evaluation.
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - input_rows
@ -4458,15 +4503,18 @@ components:
                - type: string
                - type: array
                - type: object
+          description: The generations from the evaluation.
        scores:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
+          description: The scores from the evaluation.
      additionalProperties: false
      required:
        - generations
        - scores
      title: EvaluateResponse
+      description: The response from an evaluation.
    ScoringResult:
      type: object
      properties:
@ -4482,6 +4530,8 @@ components:
                - type: string
                - type: array
                - type: object
+          description: >-
+            The scoring result for each row. Each row is a map of column name to value.
        aggregated_results:
          type: object
          additionalProperties:
@ -4492,11 +4542,13 @@ components:
              - type: string
              - type: array
              - type: object
+          description: Map of metric name to aggregated value
      additionalProperties: false
      required:
        - score_rows
        - aggregated_results
      title: ScoringResult
+      description: A scoring result for a single scoring function, containing per-row scores and aggregated results.
    Session:
      type: object
      properties:
@ -4809,15 +4861,19 @@ components:
                - type: string
                - type: array
                - type: object
+          description: The rows in the current page.
        total_count:
          type: integer
+          description: The total number of rows in the dataset.
        next_page_token:
          type: string
+          description: The token to get the next page of rows.
      additionalProperties: false
      required:
        - rows
        - total_count
      title: PaginatedRowsResult
+      description: A paginated list of rows from a dataset.
    ScoringFn:
      type: object
      properties:
@ -6248,6 +6304,7 @@ components:
      properties:
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - benchmark_config
@ -6329,12 +6386,15 @@ components:
                - type: string
                - type: array
                - type: object
+          description: The rows to score.
        scoring_functions:
          type: object
          additionalProperties:
            oneOf:
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
+          description: >-
+            The scoring functions to use for the scoring.
      additionalProperties: false
      required:
        - input_rows
@ -6347,10 +6407,13 @@ components:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            A map of scoring function name to ScoringResult.
      additionalProperties: false
      required:
        - results
      title: ScoreResponse
+      description: The response from scoring.
    ScoreBatchRequest:
      type: object
      properties:
@ -6621,6 +6684,8 @@ tags:
  - name: DatasetIO
  - name: Datasets
  - name: Eval
+    x-displayName: >-
+      Llama Stack Evaluation API for running evaluations on model and agent candidates.
  - name: Files (Coming Soon)
  - name: Inference
    description: >-
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@ -14,6 +14,14 @@ from llama_stack.schema_utils import json_schema_type, webmethod

@json_schema_type
 class PaginatedRowsResult(BaseModel):
+    """
+    A paginated list of rows from a dataset.
+
+    :param rows: The rows in the current page.
+    :param total_count: The total number of rows in the dataset.
+    :param next_page_token: The token to get the next page of rows.
+    """
+
    # the rows obey the DatasetSchema for the given dataset
    rows: List[Dict[str, Any]]
    total_count: int
@ -36,7 +44,15 @@ class DatasetIO(Protocol):
        rows_in_page: int,
        page_token: Optional[str] = None,
        filter_condition: Optional[str] = None,
-    ) -> PaginatedRowsResult: ...
+    ) -> PaginatedRowsResult:
+        """Get a paginated list of rows from a dataset.
+
+        :param dataset_id: The ID of the dataset to get the rows from.
+        :param rows_in_page: The number of rows to get per page.
+        :param page_token: The token to get the next page of rows.
+        :param filter_condition: (Optional) A condition to filter the rows by.
+        """
+        ...

    @webmethod(route="/datasetio/rows", method="POST")
    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -19,6 +19,13 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho

@json_schema_type
 class ModelCandidate(BaseModel):
+    """A model candidate for evaluation.
+
+    :param model: The model ID to evaluate.
+    :param sampling_params: The sampling parameters for the model.
+    :param system_message: (Optional) The system message providing instructions or context to the model.
+    """
+
    type: Literal["model"] = "model"
    model: str
    sampling_params: SamplingParams
@ -27,6 +34,11 @@ class ModelCandidate(BaseModel):

@json_schema_type
 class AgentCandidate(BaseModel):
+    """An agent candidate for evaluation.
+
+    :param config: The configuration for the agent candidate.
+    """
+
    type: Literal["agent"] = "agent"
    config: AgentConfig

@ -39,6 +51,13 @@ EvalCandidate = register_schema(

@json_schema_type
 class BenchmarkConfig(BaseModel):
+    """A benchmark configuration for evaluation.
+
+    :param eval_candidate: The candidate to evaluate.
+    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
+    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
+    """
+
    eval_candidate: EvalCandidate
    scoring_params: Dict[str, ScoringFnParams] = Field(
        description="Map between scoring function id and parameters for each scoring function you want to run",
@ -53,18 +72,32 @@ class BenchmarkConfig(BaseModel):

@json_schema_type
 class EvaluateResponse(BaseModel):
+    """The response from an evaluation.
+
+    :param generations: The generations from the evaluation.
+    :param scores: The scores from the evaluation.
+    """
+
    generations: List[Dict[str, Any]]
    # each key in the dict is a scoring function name
    scores: Dict[str, ScoringResult]


 class Eval(Protocol):
+    """Llama Stack Evaluation API for running evaluations on model and agent candidates."""
+
    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
    async def run_eval(
        self,
        benchmark_id: str,
        benchmark_config: BenchmarkConfig,
-    ) -> Job: ...
+    ) -> Job:
+        """Run an evaluation on a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to run the evaluation on.
+        :param benchmark_config: The configuration for the benchmark.
+        :return: The job that was created to run the evaluation.
+        """

    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
    async def evaluate_rows(
@ -73,13 +106,40 @@ class Eval(Protocol):
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse: ...
+    ) -> EvaluateResponse:
+        """Evaluate a list of rows on a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to run the evaluation on.
+        :param input_rows: The rows to evaluate.
+        :param scoring_functions: The scoring functions to use for the evaluation.
+        :param benchmark_config: The configuration for the benchmark.
+        :return: EvaluateResponse object containing generations and scores.
+        """

    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
+        """Get the status of a job.
+
+        :param benchmark_id: The ID of the benchmark that the job belongs to.
+        :param job_id: The ID of the job to get the status of.
+        :return: The status of the evaluation job.
+        """
+        ...

    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
+        """Cancel a job.
+
+        :param benchmark_id: The ID of the benchmark that the job belongs to.
+        :param job_id: The ID of the job to cancel.
+        """
+        ...

    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        """Get the result of a job.
+
+        :param benchmark_id: The ID of the benchmark that the job belongs to.
+        :param job_id: The ID of the job to get the result of.
+        :return: The result of the job.
+        """
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@ -17,6 +17,13 @@ ScoringResultRow = Dict[str, Any]

@json_schema_type
 class ScoringResult(BaseModel):
+    """
+    The result of a scoring function: per-row scores plus aggregated metrics.
+
+    :param score_rows: The scoring result for each row. Each row is a map of column name to value.
+    :param aggregated_results: Map of metric name to aggregated value
+    """
+
    score_rows: List[ScoringResultRow]
    # aggregated metrics to value
    aggregated_results: Dict[str, Any]
@ -30,6 +37,12 @@ class ScoreBatchResponse(BaseModel):

@json_schema_type
 class ScoreResponse(BaseModel):
+    """
+    The response from scoring.
+
+    :param results: A map of scoring function name to ScoringResult.
+    """
+
    # each key in the dict is a scoring function name
    results: Dict[str, ScoringResult]

@ -55,4 +68,11 @@ class Scoring(Protocol):
        self,
        input_rows: List[Dict[str, Any]],
        scoring_functions: Dict[str, Optional[ScoringFnParams]],
-    ) -> ScoreResponse: ...
+    ) -> ScoreResponse:
+        """Score a list of rows.
+
+        :param input_rows: The rows to score.
+        :param scoring_functions: The scoring functions to use for the scoring.
+        :return: ScoreResponse object containing rows and aggregated results.
+        """
+        ...