Merge branch 'main' into eval_api_final

2025-03-17 17:00:30 -07:00 · 2025-03-17 17:00:30 -07:00 · 66cd83fb58
commit 66cd83fb58
parent 62abe2899a 5287b437ae
37 changed files with 1215 additions and 840 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2233,6 +2233,67 @@
            }
        },
        "/v1/datasetio/iterrows/{dataset_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/IterrowsResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "DatasetIO"
+                ],
+                "description": "Get a paginated list of rows from a dataset. Uses offset-based pagination via start_index and limit.",
+                "parameters": [
+                    {
+                        "name": "dataset_id",
+                        "in": "path",
+                        "description": "The ID of the dataset to get the rows from.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "start_index",
+                        "in": "query",
+                        "description": "Index into dataset for the first row to get. Get all rows if None.",
+                        "required": false,
+                        "schema": {
+                            "type": "integer"
+                        }
+                    },
+                    {
+                        "name": "limit",
+                        "in": "query",
+                        "description": "The number of rows to get.",
+                        "required": false,
+                        "schema": {
+                            "type": "integer"
+                        }
+                    }
+                ]
+            }
+        },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
@ -6552,100 +6613,14 @@
                        "const": "factuality",
                        "default": "factuality"
                    },
-                    "factuality": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                }
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "aggregation_functions"
-                        ],
-                        "title": "BasicGraderParams"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "factuality"
-                ],
-                "title": "FactualityGrader"
-            },
-            "FaithfulnessGrader": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "faithfulness",
-                        "default": "faithfulness"
-                    },
-                    "faithfulness": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                }
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "aggregation_functions"
-                        ],
-                        "title": "BasicGraderParams"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "faithfulness"
-                ],
-                "title": "FaithfulnessGrader"
-            },
-            "Grader": {
-                "type": "object",
-                "properties": {
-                    "identifier": {
+                    "dataset_id": {
                        "type": "string"
                    },
-                    "provider_resource_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "grader",
-                        "default": "grader"
-                    },
-                    "grader": {
-                        "$ref": "#/components/schemas/GraderDefinition"
-                    },
-                    "description": {
-                        "type": "string"
+                    "scoring_functions": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
                    },
                    "metadata": {
                        "type": "object",
@ -6679,98 +6654,163 @@
                    "provider_resource_id",
                    "provider_id",
                    "type",
-                    "grader",
+                    "dataset_id",
+                    "scoring_functions",
                    "metadata"
                ],
-                "title": "Grader"
+                "title": "Benchmark"
            },
-            "GraderDefinition": {
+            "DataSource": {
                "oneOf": [
                    {
-                        "$ref": "#/components/schemas/LlmGrader"
+                        "$ref": "#/components/schemas/URIDataSource"
                    },
                    {
-                        "$ref": "#/components/schemas/RegexParserGrader"
-                    },
-                    {
-                        "$ref": "#/components/schemas/EqualityGrader"
-                    },
-                    {
-                        "$ref": "#/components/schemas/SubsetOfGrader"
-                    },
-                    {
-                        "$ref": "#/components/schemas/FactualityGrader"
-                    },
-                    {
-                        "$ref": "#/components/schemas/FaithfulnessGrader"
+                        "$ref": "#/components/schemas/RowsDataSource"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
-                        "llm": "#/components/schemas/LlmGrader",
-                        "regex_parser": "#/components/schemas/RegexParserGrader",
-                        "equality": "#/components/schemas/EqualityGrader",
-                        "subset_of": "#/components/schemas/SubsetOfGrader",
-                        "factuality": "#/components/schemas/FactualityGrader",
-                        "faithfulness": "#/components/schemas/FaithfulnessGrader"
+                        "uri": "#/components/schemas/URIDataSource",
+                        "rows": "#/components/schemas/RowsDataSource"
                    }
                }
            },
-            "LlmGrader": {
+            "Grader": {
+                "type": "object",
+                "properties": {
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "provider_resource_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "grader",
+                        "default": "grader"
+                    },
+                    "purpose": {
+                        "type": "string",
+                        "enum": [
+                            "post-training/messages",
+                            "eval/question-answer",
+                            "eval/messages-answer"
+                        ],
+                        "title": "DatasetPurpose",
+                        "description": "Purpose of the dataset. Each purpose has a required input data schema."
+                    },
+                    "source": {
+                        "$ref": "#/components/schemas/DataSource"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "identifier",
+                    "provider_resource_id",
+                    "provider_id",
+                    "type",
+                    "purpose",
+                    "source",
+                    "metadata"
+                ],
+                "title": "Dataset"
+            },
+            "RowsDataSource": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
-                        "const": "llm",
-                        "default": "llm"
+                        "const": "rows",
+                        "default": "rows"
                    },
-                    "llm": {
-                        "type": "object",
-                        "properties": {
-                            "model": {
-                                "type": "string"
-                            },
-                            "prompt": {
-                                "type": "string"
-                            },
-                            "score_regexes": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                }
-                            },
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                }
+                    "rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
                            }
                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "model",
-                            "prompt",
-                            "score_regexes",
-                            "aggregation_functions"
-                        ],
-                        "title": "LlmGraderParams"
+                        "description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
-                    "llm"
+                    "rows"
                ],
-                "title": "LlmGrader"
+                "title": "RowsDataSource",
+                "description": "A dataset stored in rows."
+            },
+            "URIDataSource": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "uri",
+                        "default": "uri"
+                    },
+                    "uri": {
+                        "type": "string",
+                        "description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\""
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "uri"
+                ],
+                "title": "URIDataSource",
+                "description": "A dataset that can be obtained from a URI."
            },
            "RegexParserGrader": {
                "type": "object",
@ -6819,45 +6859,182 @@
                ],
                "title": "RegexParserGrader"
            },
-            "SubsetOfGrader": {
+            "ModelType": {
+                "type": "string",
+                "enum": [
+                    "llm",
+                    "embedding"
+                ],
+                "title": "ModelType"
+            },
+            "AgentTurnInputType": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
-                        "const": "subset_of",
-                        "default": "subset_of"
-                    },
-                    "subset_of": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                }
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "aggregation_functions"
-                        ],
-                        "title": "BasicGraderParams"
+                        "const": "agent_turn_input",
+                        "default": "agent_turn_input"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "type",
-                    "subset_of"
+                    "type"
                ],
-                "title": "SubsetOfGrader"
+                "title": "AgentTurnInputType"
+            },
+            "ArrayType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "array",
+                        "default": "array"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ArrayType"
+            },
+            "BooleanType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "boolean",
+                        "default": "boolean"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "BooleanType"
+            },
+            "ChatCompletionInputType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "chat_completion_input",
+                        "default": "chat_completion_input"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ChatCompletionInputType"
+            },
+            "CompletionInputType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "completion_input",
+                        "default": "completion_input"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "CompletionInputType"
+            },
+            "JsonType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "json",
+                        "default": "json"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "JsonType"
+            },
+            "NumberType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "number",
+                        "default": "number"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "NumberType"
+            },
+            "ObjectType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "object",
+                        "default": "object"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ObjectType"
+            },
+            "ParamType": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/StringType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/NumberType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/BooleanType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ArrayType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ObjectType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/JsonType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/UnionType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ChatCompletionInputType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/CompletionInputType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AgentTurnInputType"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "string": "#/components/schemas/StringType",
+                        "number": "#/components/schemas/NumberType",
+                        "boolean": "#/components/schemas/BooleanType",
+                        "array": "#/components/schemas/ArrayType",
+                        "object": "#/components/schemas/ObjectType",
+                        "json": "#/components/schemas/JsonType",
+                        "union": "#/components/schemas/UnionType",
+                        "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
+                        "completion_input": "#/components/schemas/CompletionInputType",
+                        "agent_turn_input": "#/components/schemas/AgentTurnInputType"
+                    }
+                }
            },
            "Model": {
                "type": "object",
@ -6913,17 +7090,39 @@
                    "provider_id",
                    "type",
                    "metadata",
-                    "model_type"
+                    "return_type"
                ],
-                "title": "Model"
+                "title": "ScoringFn"
            },
-            "ModelType": {
-                "type": "string",
-                "enum": [
-                    "llm",
-                    "embedding"
+            "StringType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "string",
+                        "default": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
                ],
-                "title": "ModelType"
+                "title": "StringType"
+            },
+            "UnionType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "union",
+                        "default": "union"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "UnionType"
            },
            "Shield": {
                "type": "object",
@ -8131,7 +8330,7 @@
                        },
                        "description": "The rows in the current page."
                    },
-                    "next_index": {
+                    "next_start_index": {
                        "type": "integer",
                        "description": "Index into dataset for the first row in the next page. None if there are no more rows."
                    }
@ -9440,7 +9639,7 @@
                    },
                    "source": {
                        "$ref": "#/components/schemas/DataSource",
-                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
+                        "description": "The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"} ] } ] }"
                    },
                    "metadata": {
                        "type": "object",
@ -9478,50 +9677,6 @@
                    "purpose",
                    "source"
                ],
-                "title": "RegisterDatasetRequest"
-            },
-            "RegisterGraderRequest": {
-                "type": "object",
-                "properties": {
-                    "grader": {
-                        "$ref": "#/components/schemas/GraderDefinition",
-                        "description": "The grader definition, E.g. - { \"type\": \"llm\", \"llm\": { \"model\": \"llama-405b\", \"prompt\": \"You are a judge. Score the answer based on the question. {question} {answer}\", } }"
-                    },
-                    "grader_id": {
-                        "type": "string",
-                        "description": "(Optional) The ID of the grader. If not provided, a random ID will be generated."
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "(Optional) Any additional metadata for this grader. - E.g. { \"description\": \"A grader that scores the answer based on the question.\", }"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "grader"
-                ],
                "title": "RegisterGraderRequest"
            },
            "RegisterModelRequest": {
@ -10199,9 +10354,6 @@
        {
            "name": "Files"
        },
-        {
-            "name": "Graders"
-        },
        {
            "name": "Inference",
            "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@ -10254,9 +10406,8 @@
                "Benchmarks",
                "DatasetIO",
                "Datasets",
-                "Evaluation",
+                "Eval",
                "Files",
-                "Graders",
                "Inference",
                "Inspect",
                "Models",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1507,6 +1507,50 @@ paths:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
  /v1/datasetio/iterrows/{dataset_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IterrowsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - DatasetIO
+      description: >-
+        Get a paginated list of rows from a dataset. Uses offset-based pagination via start_index and limit.
+      parameters:
+        - name: dataset_id
+          in: path
+          description: >-
+            The ID of the dataset to get the rows from.
+          required: true
+          schema:
+            type: string
+        - name: start_index
+          in: query
+          description: >-
+            Index into dataset for the first row to get. Get all rows if None.
+          required: false
+          schema:
+            type: integer
+        - name: limit
+          in: query
+          description: The number of rows to get.
+          required: false
+          schema:
+            type: integer
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
@ -4527,255 +4571,6 @@ components:
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
-    EqualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: equality
-          default: equality
-        equality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - equality
-      title: EqualityGrader
-    FactualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: factuality
-          default: factuality
-        factuality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - factuality
-      title: FactualityGrader
-    FaithfulnessGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: faithfulness
-          default: faithfulness
-        faithfulness:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - faithfulness
-      title: FaithfulnessGrader
-    Grader:
-      type: object
-      properties:
-        identifier:
-          type: string
-        provider_resource_id:
-          type: string
-        provider_id:
-          type: string
-        type:
-          type: string
-          const: grader
-          default: grader
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-        description:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - identifier
-        - provider_resource_id
-        - provider_id
-        - type
-        - grader
-        - metadata
-      title: Grader
-    GraderDefinition:
-      oneOf:
-        - $ref: '#/components/schemas/LlmGrader'
-        - $ref: '#/components/schemas/RegexParserGrader'
-        - $ref: '#/components/schemas/EqualityGrader'
-        - $ref: '#/components/schemas/SubsetOfGrader'
-        - $ref: '#/components/schemas/FactualityGrader'
-        - $ref: '#/components/schemas/FaithfulnessGrader'
-      discriminator:
-        propertyName: type
-        mapping:
-          llm: '#/components/schemas/LlmGrader'
-          regex_parser: '#/components/schemas/RegexParserGrader'
-          equality: '#/components/schemas/EqualityGrader'
-          subset_of: '#/components/schemas/SubsetOfGrader'
-          factuality: '#/components/schemas/FactualityGrader'
-          faithfulness: '#/components/schemas/FaithfulnessGrader'
-    LlmGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-        llm:
-          type: object
-          properties:
-            model:
-              type: string
-            prompt:
-              type: string
-            score_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - model
-            - prompt
-            - score_regexes
-            - aggregation_functions
-          title: LlmGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - llm
-      title: LlmGrader
-    RegexParserGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: regex_parser
-          default: regex_parser
-        regex_parser:
-          type: object
-          properties:
-            parsing_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - parsing_regexes
-            - aggregation_functions
-          title: RegexParserGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - regex_parser
-      title: RegexParserGrader
-    SubsetOfGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: subset_of
-          default: subset_of
-        subset_of:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - subset_of
-      title: SubsetOfGrader
    Model:
      type: object
      properties:
@ -4817,6 +4612,224 @@ components:
        - llm
        - embedding
      title: ModelType
+    AgentTurnInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: agent_turn_input
+          default: agent_turn_input
+      additionalProperties: false
+      required:
+        - type
+      title: AgentTurnInputType
+    ArrayType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: array
+          default: array
+      additionalProperties: false
+      required:
+        - type
+      title: ArrayType
+    BooleanType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: boolean
+          default: boolean
+      additionalProperties: false
+      required:
+        - type
+      title: BooleanType
+    ChatCompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: chat_completion_input
+          default: chat_completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: ChatCompletionInputType
+    CompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: completion_input
+          default: completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: CompletionInputType
+    JsonType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: json
+          default: json
+      additionalProperties: false
+      required:
+        - type
+      title: JsonType
+    NumberType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: number
+          default: number
+      additionalProperties: false
+      required:
+        - type
+      title: NumberType
+    ObjectType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: object
+          default: object
+      additionalProperties: false
+      required:
+        - type
+      title: ObjectType
+    ParamType:
+      oneOf:
+        - $ref: '#/components/schemas/StringType'
+        - $ref: '#/components/schemas/NumberType'
+        - $ref: '#/components/schemas/BooleanType'
+        - $ref: '#/components/schemas/ArrayType'
+        - $ref: '#/components/schemas/ObjectType'
+        - $ref: '#/components/schemas/JsonType'
+        - $ref: '#/components/schemas/UnionType'
+        - $ref: '#/components/schemas/ChatCompletionInputType'
+        - $ref: '#/components/schemas/CompletionInputType'
+        - $ref: '#/components/schemas/AgentTurnInputType'
+      discriminator:
+        propertyName: type
+        mapping:
+          string: '#/components/schemas/StringType'
+          number: '#/components/schemas/NumberType'
+          boolean: '#/components/schemas/BooleanType'
+          array: '#/components/schemas/ArrayType'
+          object: '#/components/schemas/ObjectType'
+          json: '#/components/schemas/JsonType'
+          union: '#/components/schemas/UnionType'
+          chat_completion_input: '#/components/schemas/ChatCompletionInputType'
+          completion_input: '#/components/schemas/CompletionInputType'
+          agent_turn_input: '#/components/schemas/AgentTurnInputType'
+    ScoringFn:
+      type: object
+      properties:
+        identifier:
+          type: string
+        provider_resource_id:
+          type: string
+        provider_id:
+          type: string
+        type:
+          type: string
+          const: scoring_function
+          default: scoring_function
+        description:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        return_type:
+          $ref: '#/components/schemas/ParamType'
+        params:
+          $ref: '#/components/schemas/ScoringFnParams'
+      additionalProperties: false
+      required:
+        - identifier
+        - provider_resource_id
+        - provider_id
+        - type
+        - metadata
+        - return_type
+      title: ScoringFn
+    StringType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: string
+          default: string
+      additionalProperties: false
+      required:
+        - type
+      title: StringType
+    UnionType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: union
+          default: union
+      additionalProperties: false
+      required:
+        - type
+      title: UnionType
    Shield:
      type: object
      properties:
@ -5580,7 +5593,7 @@ components:
                - type: array
                - type: object
          description: The rows in the current page.
-        next_index:
+        next_start_index:
          type: integer
          description: >-
            Index into dataset for the first row in the next page. None if there are
@ -6461,12 +6474,14 @@ components:
        source:
          $ref: '#/components/schemas/DataSource'
          description: >-
-            The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
-            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "uri",
-            "uri": "data:csv;base64,{base64_content}" } - { "type": "uri", "uri":
-            "huggingface://llamastack/simpleqa?split=train" } - { "type": "rows",
-            "rows": [ { "messages": [ {"role": "user", "content": "Hello, world!"},
-            {"role": "assistant", "content": "Hello, world!"}, ] } ] }
+            The data source of the dataset. Ensure that the data source schema is
+            compatible with the purpose of the dataset. Examples: - { "type": "uri",
+            "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri":
+            "lsfs://mydata.jsonl" } - { "type": "uri", "uri": "data:csv;base64,{base64_content}"
+            } - { "type": "uri", "uri": "huggingface://llamastack/simpleqa?split=train"
+            } - { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content":
+            "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"} ]
+            } ] }
        metadata:
          type: object
          additionalProperties:
@ -6488,37 +6503,6 @@ components:
        - purpose
        - source
      title: RegisterDatasetRequest
-    RegisterGraderRequest:
-      type: object
-      properties:
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-          description: >-
-            The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b",
-            "prompt": "You are a judge. Score the answer based on the question. {question}
-            {answer}", } }
-        grader_id:
-          type: string
-          description: >-
-            (Optional) The ID of the grader. If not provided, a random ID will be
-            generated.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            (Optional) Any additional metadata for this grader. - E.g. { "description":
-            "A grader that scores the answer based on the question.", }
-      additionalProperties: false
-      required:
-        - grader
-      title: RegisterGraderRequest
    RegisterModelRequest:
      type: object
      properties:
@ -6951,9 +6935,10 @@ tags:
  - name: Benchmarks
  - name: DatasetIO
  - name: Datasets
-  - name: Evaluation
+  - name: Eval
+    x-displayName: >-
+      Llama Stack Evaluation API for running evaluations on model and agent candidates.
  - name: Files
-  - name: Graders
  - name: Inference
    description: >-
      This API provides the raw interface to the underlying models. Two kinds of models
@ -6988,9 +6973,8 @@ x-tagGroups:
      - Benchmarks
      - DatasetIO
      - Datasets
-      - Evaluation
+      - Eval
      - Files
-      - Graders
      - Inference
      - Inspect
      - Models
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.


 Here are some example PRs to help you get started:
--- a/docs/source/distributions/remote_hosted_distro/nvidia.md
+++ b/docs/source/distributions/remote_hosted_distro/nvidia.md
@ -6,13 +6,13 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
+| datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| safety | `remote::nvidia` |
+| scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `inline::rag-runtime` |
 | vector_io | `inline::faiss` |


@ -20,8 +20,10 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
+- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
+- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

 ### Models

--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@ -6,17 +6,32 @@ The `llama-stack-client` CLI allows you to query information about the distribut

 ### `llama-stack-client`
 ```bash
-llama-stack-client -h
+llama-stack-client
+Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...

-usage: llama-stack-client [-h] {models,memory_banks,shields} ...
+  Welcome to the LlamaStackClient CLI

-Welcome to the LlamaStackClient CLI
+Options:
+  --version        Show the version and exit.
+  --endpoint TEXT  Llama Stack distribution endpoint
+  --api-key TEXT   Llama Stack distribution API key
+  --config TEXT    Path to config file
+  --help           Show this message and exit.

-options:
-  -h, --help            show this help message and exit
-
-subcommands:
-  {models,memory_banks,shields}
+Commands:
+  configure          Configure Llama Stack Client CLI.
+  datasets           Manage datasets.
+  eval               Run evaluation tasks.
+  eval_tasks         Manage evaluation tasks.
+  inference          Inference (chat).
+  inspect            Inspect server configuration.
+  models             Manage GenAI models.
+  post_training      Post-training.
+  providers          Manage API providers.
+  scoring_functions  Manage scoring functions.
+  shields            Manage safety shield services.
+  toolgroups         Manage available tool groups.
+  vector_dbs         Manage vector databases.
 ```

 ### `llama-stack-client configure`
@ -127,11 +142,11 @@ llama-stack-client vector_dbs list
 llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
 ```

-Options:
- `--provider-id`: Optional. Provider ID for the vector db
- `--provider-vector-db-id`: Optional. Provider's vector db ID
- `--embedding-model`: Optional. Embedding model to use. Default: "all-MiniLM-L6-v2"
- `--embedding-dimension`: Optional. Dimension of embeddings. Default: 384
+Optional arguments:
+- `--provider-id`: Provider ID for the vector db
+- `--provider-vector-db-id`: Provider's vector db ID
+- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
+- `--embedding-dimension`: Dimension of embeddings. Default: 384

 ### `llama-stack-client vector_dbs unregister`
 ```bash
@ -157,11 +172,13 @@ llama-stack-client shields list
 llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
 ```

-Options:
- `--shield-id`: Required. ID of the shield
- `--provider-id`: Optional. Provider ID for the shield
- `--provider-shield-id`: Optional. Provider's shield ID
- `--params`: Optional. JSON configuration parameters for the shield
+Required arguments:
+- `--shield-id`: ID of the shield
+
+Optional arguments:
+- `--provider-id`: Provider ID for the shield
+- `--provider-shield-id`: Provider's shield ID
+- `--params`: JSON configuration parameters for the shield

 ## Eval Task Management

@ -175,13 +192,15 @@ llama-stack-client benchmarks list
 llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
 ```

-Options:
- `--eval-task-id`: Required. ID of the eval task
- `--dataset-id`: Required. ID of the dataset to evaluate
- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
- `--provider-id`: Optional. Provider ID for the eval task
- `--provider-eval-task-id`: Optional. Provider's eval task ID
- `--metadata`: Optional. Metadata for the eval task in JSON format
+Required arguments:
+- `--eval-task-id`: ID of the eval task
+- `--dataset-id`: ID of the dataset to evaluate
+- `--scoring-functions`: One or more scoring functions to use for evaluation
+
+Optional arguments:
+- `--provider-id`: Provider ID for the eval task
+- `--provider-eval-task-id`: Provider's eval task ID
+- `--metadata`: Metadata for the eval task in JSON format

 ## Eval Execution
 ### `llama-stack-client eval run-benchmark`
@ -189,11 +208,13 @@ Options:
 llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
 ```

-Options:
- `--eval-task-config`: Required. Path to the eval task config file in JSON format
- `--output-dir`: Required. Path to the directory where evaluation results will be saved
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
+Required arguments:
+- `--eval-task-config`: Path to the eval task config file in JSON format
+- `--output-dir`: Path to the directory where evaluation results will be saved
+
+Optional arguments:
+- `--num-examples`: Number of examples to evaluate (useful for debugging)
+- `--visualize`: If set, visualizes evaluation results after completion

 Example benchmark_config.json:
 ```json
@ -214,11 +235,13 @@ Example benchmark_config.json:
 llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
 ```

-Options:
- `--eval-task-config`: Required. Path to the eval task config file in JSON format
- `--output-dir`: Required. Path to the directory where scoring results will be saved
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes scoring results after completion
+Required arguments:
+- `--eval-task-config`: Path to the eval task config file in JSON format
+- `--output-dir`: Path to the directory where scoring results will be saved
+
+Optional arguments:
+- `--num-examples`: Number of examples to evaluate (useful for debugging)
+- `--visualize`: If set, visualizes scoring results after completion

 ## Tool Group Management

@ -230,11 +253,11 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
-| builtin::code_interpreter | code-interpreter | None | None         |
+| builtin::code_interpreter | code-interpreter | None | None          |
 +---------------------------+------------------+------+---------------+
-| builtin::rag             | rag-runtime      | None | None         |
+| builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
-| builtin::websearch       | tavily-search    | None | None         |
+| builtin::websearch        | tavily-search    | None | None          |
 +---------------------------+------------------+------+---------------+
 ```

@ -250,11 +273,11 @@ Shows detailed information about a specific toolgroup. If the toolgroup is not f
 llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
 ```

-Options:
- `--provider-id`: Optional. Provider ID for the toolgroup
- `--provider-toolgroup-id`: Optional. Provider's toolgroup ID
- `--mcp-config`: Optional. JSON configuration for the MCP endpoint
- `--args`: Optional. JSON arguments for the toolgroup
+Optional arguments:
+- `--provider-id`: Provider ID for the toolgroup
+- `--provider-toolgroup-id`: Provider's toolgroup ID
+- `--mcp-config`: JSON configuration for the MCP endpoint
+- `--args`: JSON arguments for the toolgroup

 ### `llama-stack-client toolgroups unregister`
 ```bash