Merge branch 'main' into eval_api_final

2025-03-17 17:00:30 -07:00 · 2025-03-17 17:00:30 -07:00 · 66cd83fb58
commit 66cd83fb58
parent 62abe2899a 5287b437ae
37 changed files with 1215 additions and 840 deletions
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1507,6 +1507,50 @@ paths:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
  /v1/datasetio/iterrows/{dataset_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IterrowsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - DatasetIO
+      description: >-
+        Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+      parameters:
+        - name: dataset_id
+          in: path
+          description: >-
+            The ID of the dataset to get the rows from.
+          required: true
+          schema:
+            type: string
+        - name: start_index
+          in: query
+          description: >-
+            Index into dataset for the first row to get. Get all rows if None.
+          required: false
+          schema:
+            type: integer
+        - name: limit
+          in: query
+          description: The number of rows to get.
+          required: false
+          schema:
+            type: integer
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
@ -4527,255 +4571,6 @@ components:
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
-    EqualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: equality
-          default: equality
-        equality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - equality
-      title: EqualityGrader
-    FactualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: factuality
-          default: factuality
-        factuality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - factuality
-      title: FactualityGrader
-    FaithfulnessGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: faithfulness
-          default: faithfulness
-        faithfulness:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - faithfulness
-      title: FaithfulnessGrader
-    Grader:
-      type: object
-      properties:
-        identifier:
-          type: string
-        provider_resource_id:
-          type: string
-        provider_id:
-          type: string
-        type:
-          type: string
-          const: grader
-          default: grader
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-        description:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - identifier
-        - provider_resource_id
-        - provider_id
-        - type
-        - grader
-        - metadata
-      title: Grader
-    GraderDefinition:
-      oneOf:
-        - $ref: '#/components/schemas/LlmGrader'
-        - $ref: '#/components/schemas/RegexParserGrader'
-        - $ref: '#/components/schemas/EqualityGrader'
-        - $ref: '#/components/schemas/SubsetOfGrader'
-        - $ref: '#/components/schemas/FactualityGrader'
-        - $ref: '#/components/schemas/FaithfulnessGrader'
-      discriminator:
-        propertyName: type
-        mapping:
-          llm: '#/components/schemas/LlmGrader'
-          regex_parser: '#/components/schemas/RegexParserGrader'
-          equality: '#/components/schemas/EqualityGrader'
-          subset_of: '#/components/schemas/SubsetOfGrader'
-          factuality: '#/components/schemas/FactualityGrader'
-          faithfulness: '#/components/schemas/FaithfulnessGrader'
-    LlmGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-        llm:
-          type: object
-          properties:
-            model:
-              type: string
-            prompt:
-              type: string
-            score_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - model
-            - prompt
-            - score_regexes
-            - aggregation_functions
-          title: LlmGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - llm
-      title: LlmGrader
-    RegexParserGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: regex_parser
-          default: regex_parser
-        regex_parser:
-          type: object
-          properties:
-            parsing_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - parsing_regexes
-            - aggregation_functions
-          title: RegexParserGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - regex_parser
-      title: RegexParserGrader
-    SubsetOfGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: subset_of
-          default: subset_of
-        subset_of:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - subset_of
-      title: SubsetOfGrader
    Model:
      type: object
      properties:
@ -4817,6 +4612,224 @@ components:
        - llm
        - embedding
      title: ModelType
+    AgentTurnInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: agent_turn_input
+          default: agent_turn_input
+      additionalProperties: false
+      required:
+        - type
+      title: AgentTurnInputType
+    ArrayType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: array
+          default: array
+      additionalProperties: false
+      required:
+        - type
+      title: ArrayType
+    BooleanType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: boolean
+          default: boolean
+      additionalProperties: false
+      required:
+        - type
+      title: BooleanType
+    ChatCompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: chat_completion_input
+          default: chat_completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: ChatCompletionInputType
+    CompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: completion_input
+          default: completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: CompletionInputType
+    JsonType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: rows
+          default: rows
+        rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The dataset is stored in rows. E.g. - [ {"messages": [{"role": "user",
+            "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
+            world!"}]} ]
+      additionalProperties: false
+      required:
+        - type
+        - rows
+      title: RowsDataSource
+      description: A dataset stored in rows.
+    URIDataSource:
+      type: object
+      properties:
+        type:
+          type: string
+          const: uri
+          default: uri
+        uri:
+          type: string
+          description: >-
+            The dataset can be obtained from a URI. E.g. - "https://mywebsite.com/mydata.jsonl"
+            - "lsfs://mydata.jsonl" - "data:csv;base64,{base64_content}"
+      additionalProperties: false
+      required:
+        - type
+        - uri
+      title: URIDataSource
+      description: >-
+        A dataset that can be obtained from a URI.
+    EqualityGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: equality
+          default: equality
+        equality:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - aggregation_functions
+          title: BasicGraderParams
+      additionalProperties: false
+      required:
+        - type
+      title: ObjectType
+    ParamType:
+      oneOf:
+        - $ref: '#/components/schemas/StringType'
+        - $ref: '#/components/schemas/NumberType'
+        - $ref: '#/components/schemas/BooleanType'
+        - $ref: '#/components/schemas/ArrayType'
+        - $ref: '#/components/schemas/ObjectType'
+        - $ref: '#/components/schemas/JsonType'
+        - $ref: '#/components/schemas/UnionType'
+        - $ref: '#/components/schemas/ChatCompletionInputType'
+        - $ref: '#/components/schemas/CompletionInputType'
+        - $ref: '#/components/schemas/AgentTurnInputType'
+      discriminator:
+        propertyName: type
+        mapping:
+          string: '#/components/schemas/StringType'
+          number: '#/components/schemas/NumberType'
+          boolean: '#/components/schemas/BooleanType'
+          array: '#/components/schemas/ArrayType'
+          object: '#/components/schemas/ObjectType'
+          json: '#/components/schemas/JsonType'
+          union: '#/components/schemas/UnionType'
+          chat_completion_input: '#/components/schemas/ChatCompletionInputType'
+          completion_input: '#/components/schemas/CompletionInputType'
+          agent_turn_input: '#/components/schemas/AgentTurnInputType'
+    ScoringFn:
+      type: object
+      properties:
+        identifier:
+          type: string
+        provider_resource_id:
+          type: string
+        provider_id:
+          type: string
+        type:
+          type: string
+          const: scoring_function
+          default: scoring_function
+        description:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        return_type:
+          $ref: '#/components/schemas/ParamType'
+        params:
+          $ref: '#/components/schemas/ScoringFnParams'
+      additionalProperties: false
+      required:
+        - identifier
+        - provider_resource_id
+        - provider_id
+        - type
+        - grader
+        - metadata
+        - return_type
+      title: ScoringFn
+    StringType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: string
+          default: string
+      additionalProperties: false
+      required:
+        - type
+      title: StringType
+    UnionType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: union
+          default: union
+      additionalProperties: false
+      required:
+        - type
+      title: UnionType
    Shield:
      type: object
      properties:
@ -5580,7 +5593,7 @@ components:
                - type: array
                - type: object
          description: The rows in the current page.
-        next_index:
+        next_start_index:
          type: integer
          description: >-
            Index into dataset for the first row in the next page. None if there are
@ -6461,12 +6474,14 @@ components:
        source:
          $ref: '#/components/schemas/DataSource'
          description: >-
-            The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
-            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "uri",
-            "uri": "data:csv;base64,{base64_content}" } - { "type": "uri", "uri":
-            "huggingface://llamastack/simpleqa?split=train" } - { "type": "rows",
-            "rows": [ { "messages": [ {"role": "user", "content": "Hello, world!"},
-            {"role": "assistant", "content": "Hello, world!"}, ] } ] }
+            The data source of the dataset. Ensure that the data source schema is
+            compatible with the purpose of the dataset. Examples: - { "type": "uri",
+            "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri":
+            "lsfs://mydata.jsonl" } - { "type": "uri", "uri": "data:csv;base64,{base64_content}"
+            } - { "type": "uri", "uri": "huggingface://llamastack/simpleqa?split=train"
+            } - { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content":
+            "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ]
+            } ] }
        metadata:
          type: object
          additionalProperties:
@ -6488,37 +6503,6 @@ components:
        - purpose
        - source
      title: RegisterDatasetRequest
-    RegisterGraderRequest:
-      type: object
-      properties:
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-          description: >-
-            The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b",
-            "prompt": "You are a judge. Score the answer based on the question. {question}
-            {answer}", } }
-        grader_id:
-          type: string
-          description: >-
-            (Optional) The ID of the grader. If not provided, a random ID will be
-            generated.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            (Optional) Any additional metadata for this grader. - E.g. { "description":
-            "A grader that scores the answer based on the question.", }
-      additionalProperties: false
-      required:
-        - grader
-      title: RegisterGraderRequest
    RegisterModelRequest:
      type: object
      properties:
@ -6951,9 +6935,10 @@ tags:
  - name: Benchmarks
  - name: DatasetIO
  - name: Datasets
-  - name: Evaluation
+  - name: Eval
+    x-displayName: >-
+      Llama Stack Evaluation API for running evaluations on model and agent candidates.
  - name: Files
-  - name: Graders
  - name: Inference
    description: >-
      This API provides the raw interface to the underlying models. Two kinds of models
@ -6988,9 +6973,8 @@ x-tagGroups:
      - Benchmarks
      - DatasetIO
      - Datasets
-      - Evaluation
+      - Eval
      - Files
-      - Graders
      - Inference
      - Inspect
      - Models