Merge branch 'pr1573' into api_2

2025-12-31 01:30:00 +00:00 · 2025-03-13 14:49:04 -07:00 · 2025-03-13 14:49:04 -07:00 · 0c37951395
commit 0c37951395
parent 025d173606 a6095820af
4 changed files with 246 additions and 354 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -40,75 +40,7 @@
        }
    ],
    "paths": {
-        "/v1/datasetio/rows": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/PaginatedRowsResult"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "DatasetIO"
-                ],
-                "description": "Get a paginated list of rows from a dataset.",
-                "parameters": [
-                    {
-                        "name": "dataset_id",
-                        "in": "query",
-                        "description": "The ID of the dataset to get the rows from.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "rows_in_page",
-                        "in": "query",
-                        "description": "The number of rows to get per page.",
-                        "required": true,
-                        "schema": {
-                            "type": "integer"
-                        }
-                    },
-                    {
-                        "name": "page_token",
-                        "in": "query",
-                        "description": "The token to get the next page of rows.",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "filter_condition",
-                        "in": "query",
-                        "description": "(Optional) A condition to filter the rows by.",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            },
+        "/v1/datasets/{dataset_id}/rows": {
            "post": {
                "responses": {
                    "200": {
@ -131,7 +63,16 @@
                    "DatasetIO"
                ],
                "description": "",
-                "parameters": [],
+                "parameters": [
+                    {
+                        "name": "dataset_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
                "requestBody": {
                    "content": {
                        "application/json": {
@ -2272,6 +2213,76 @@
                }
            }
        },
+        "/v1/datasets/{dataset_id}/iterrows": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/PaginatedRowsResult"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "DatasetIO"
+                ],
+                "description": "Get a paginated list of rows from a dataset.",
+                "parameters": [
+                    {
+                        "name": "dataset_id",
+                        "in": "path",
+                        "description": "The ID of the dataset to get the rows from.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "rows_in_page",
+                        "in": "query",
+                        "description": "The number of rows to get per page.",
+                        "required": true,
+                        "schema": {
+                            "type": "integer"
+                        }
+                    },
+                    {
+                        "name": "page_token",
+                        "in": "query",
+                        "description": "The token to get the next page of rows.",
+                        "required": false,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "filter_condition",
+                        "in": "query",
+                        "description": "(Optional) A condition to filter the rows by.",
+                        "required": false,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
@ -3861,9 +3872,6 @@
            "AppendRowsRequest": {
                "type": "object",
                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
                    "rows": {
                        "type": "array",
                        "items": {
@ -3895,7 +3903,6 @@
                },
                "additionalProperties": false,
                "required": [
-                    "dataset_id",
                    "rows"
                ],
                "title": "AppendRowsRequest"
@ -6755,9 +6762,6 @@
                    {
                        "$ref": "#/components/schemas/URIDataSource"
                    },
-                    {
-                        "$ref": "#/components/schemas/HuggingfaceDataSource"
-                    },
                    {
                        "$ref": "#/components/schemas/RowsDataSource"
                    }
@ -6766,7 +6770,6 @@
                    "propertyName": "type",
                    "mapping": {
                        "uri": "#/components/schemas/URIDataSource",
-                        "huggingface": "#/components/schemas/HuggingfaceDataSource",
                        "rows": "#/components/schemas/RowsDataSource"
                    }
                }
@ -6842,65 +6845,6 @@
                ],
                "title": "Dataset"
            },
-            "HuggingfaceDataSource": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "huggingface",
-                        "default": "huggingface",
-                        "description": "The type of the data source."
-                    },
-                    "huggingface": {
-                        "type": "object",
-                        "properties": {
-                            "path": {
-                                "type": "string",
-                                "description": "The path to the dataset in Huggingface. E.g. - \"llamastack/simpleqa\""
-                            },
-                            "params": {
-                                "type": "object",
-                                "additionalProperties": {
-                                    "oneOf": [
-                                        {
-                                            "type": "null"
-                                        },
-                                        {
-                                            "type": "boolean"
-                                        },
-                                        {
-                                            "type": "number"
-                                        },
-                                        {
-                                            "type": "string"
-                                        },
-                                        {
-                                            "type": "array"
-                                        },
-                                        {
-                                            "type": "object"
-                                        }
-                                    ]
-                                },
-                                "description": "The parameters for the dataset."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "path",
-                            "params"
-                        ],
-                        "description": "The fields for a Huggingface dataset."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "huggingface"
-                ],
-                "title": "HuggingfaceDataSource",
-                "description": "A dataset stored in Huggingface."
-            },
            "RowsDataSource": {
                "type": "object",
                "properties": {
@ -7034,56 +6978,7 @@
                ],
                "title": "ModelType"
            },
-            "PaginatedRowsResult": {
-                "type": "object",
-                "properties": {
-                    "rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The rows in the current page."
-                    },
-                    "total_count": {
-                        "type": "integer",
-                        "description": "The total number of rows in the dataset."
-                    },
-                    "next_page_token": {
-                        "type": "string",
-                        "description": "The token to get the next page of rows."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "rows",
-                    "total_count"
-                ],
-                "title": "PaginatedRowsResult",
-                "description": "A paginated list of rows from a dataset."
-            },
-            "AnswerCorrectnessScoringFn": {
+            "AgentTurnInputType": {
                "type": "object",
                "properties": {
                    "type": {
@ -8537,6 +8432,55 @@
                ],
                "title": "ToolInvocationResult"
            },
+            "PaginatedRowsResult": {
+                "type": "object",
+                "properties": {
+                    "rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The rows in the current page."
+                    },
+                    "total_count": {
+                        "type": "integer",
+                        "description": "The total number of rows in the dataset."
+                    },
+                    "next_page_token": {
+                        "type": "string",
+                        "description": "The token to get the next page of rows."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "rows",
+                    "total_count"
+                ],
+                "title": "PaginatedRowsResult",
+                "description": "A paginated list of rows from a dataset."
+            },
            "ListAgentSessionsResponse": {
                "type": "object",
                "properties": {
@ -9884,7 +9828,7 @@
                    },
                    "source": {
                        "$ref": "#/components/schemas/DataSource",
-                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"huggingface\", \"huggingface\": { \"dataset_path\": \"tatsu-lab/alpaca\", \"params\": { \"split\": \"train\" } } } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
+                        "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
                    },
                    "metadata": {
                        "type": "object",
@ -9914,7 +9858,7 @@
                    },
                    "dataset_id": {
                        "type": "string",
-                        "description": "The ID of the dataset. If not provided, a random ID will be generated."
+                        "description": "The ID of the dataset. If not provided, an ID will be generated."
                    }
                },
                "additionalProperties": false,
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -10,56 +10,7 @@ info:
 servers:
  - url: http://any-hosted-llama-stack.com
 paths:
-  /v1/datasetio/rows:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedRowsResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - DatasetIO
-      description: >-
-        Get a paginated list of rows from a dataset.
-      parameters:
-        - name: dataset_id
-          in: query
-          description: >-
-            The ID of the dataset to get the rows from.
-          required: true
-          schema:
-            type: string
-        - name: rows_in_page
-          in: query
-          description: The number of rows to get per page.
-          required: true
-          schema:
-            type: integer
-        - name: page_token
-          in: query
-          description: The token to get the next page of rows.
-          required: false
-          schema:
-            type: string
-        - name: filter_condition
-          in: query
-          description: >-
-            (Optional) A condition to filter the rows by.
-          required: false
-          schema:
-            type: string
+  /v1/datasets/{dataset_id}/rows:
    post:
      responses:
        '200':
@ -77,7 +28,12 @@ paths:
      tags:
        - DatasetIO
      description: ''
-      parameters: []
+      parameters:
+        - name: dataset_id
+          in: path
+          required: true
+          schema:
+            type: string
      requestBody:
        content:
          application/json:
@ -1529,6 +1485,56 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
+  /v1/datasets/{dataset_id}/iterrows:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PaginatedRowsResult'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - DatasetIO
+      description: >-
+        Get a paginated list of rows from a dataset.
+      parameters:
+        - name: dataset_id
+          in: path
+          description: >-
+            The ID of the dataset to get the rows from.
+          required: true
+          schema:
+            type: string
+        - name: rows_in_page
+          in: query
+          description: The number of rows to get per page.
+          required: true
+          schema:
+            type: integer
+        - name: page_token
+          in: query
+          description: The token to get the next page of rows.
+          required: false
+          schema:
+            type: string
+        - name: filter_condition
+          in: query
+          description: >-
+            (Optional) A condition to filter the rows by.
+          required: false
+          schema:
+            type: string
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
@ -2636,8 +2642,6 @@ components:
    AppendRowsRequest:
      type: object
      properties:
-        dataset_id:
-          type: string
        rows:
          type: array
          items:
@ -2652,7 +2656,6 @@ components:
                - type: object
      additionalProperties: false
      required:
-        - dataset_id
        - rows
      title: AppendRowsRequest
    CompletionMessage:
@ -4679,13 +4682,11 @@ components:
    DataSource:
      oneOf:
        - $ref: '#/components/schemas/URIDataSource'
-        - $ref: '#/components/schemas/HuggingfaceDataSource'
        - $ref: '#/components/schemas/RowsDataSource'
      discriminator:
        propertyName: type
        mapping:
          uri: '#/components/schemas/URIDataSource'
-          huggingface: '#/components/schemas/HuggingfaceDataSource'
          rows: '#/components/schemas/RowsDataSource'
    Dataset:
      type: object
@ -4734,43 +4735,6 @@ components:
        - source
        - metadata
      title: Dataset
-    HuggingfaceDataSource:
-      type: object
-      properties:
-        type:
-          type: string
-          const: huggingface
-          default: huggingface
-          description: The type of the data source.
-        huggingface:
-          type: object
-          properties:
-            path:
-              type: string
-              description: >-
-                The path to the dataset in Huggingface. E.g. - "llamastack/simpleqa"
-            params:
-              type: object
-              additionalProperties:
-                oneOf:
-                  - type: 'null'
-                  - type: boolean
-                  - type: number
-                  - type: string
-                  - type: array
-                  - type: object
-              description: The parameters for the dataset.
-          additionalProperties: false
-          required:
-            - path
-            - params
-          description: The fields for a Huggingface dataset.
-      additionalProperties: false
-      required:
-        - type
-        - huggingface
-      title: HuggingfaceDataSource
-      description: A dataset stored in Huggingface.
    RowsDataSource:
      type: object
      properties:
@ -4860,35 +4824,7 @@ components:
        - llm
        - embedding
      title: ModelType
-    PaginatedRowsResult:
-      type: object
-      properties:
-        rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows in the current page.
-        total_count:
-          type: integer
-          description: The total number of rows in the dataset.
-        next_page_token:
-          type: string
-          description: The token to get the next page of rows.
-      additionalProperties: false
-      required:
-        - rows
-        - total_count
-      title: PaginatedRowsResult
-      description: A paginated list of rows from a dataset.
-    AnswerCorrectnessScoringFn:
+    AgentTurnInputType:
      type: object
      properties:
        type:
@ -5883,6 +5819,34 @@ components:
      required:
        - content
      title: ToolInvocationResult
+    PaginatedRowsResult:
+      type: object
+      properties:
+        rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: The rows in the current page.
+        total_count:
+          type: integer
+          description: The total number of rows in the dataset.
+        next_page_token:
+          type: string
+          description: The token to get the next page of rows.
+      additionalProperties: false
+      required:
+        - rows
+        - total_count
+      title: PaginatedRowsResult
+      description: A paginated list of rows from a dataset.
    ListAgentSessionsResponse:
      type: object
      properties:
@ -6804,11 +6768,11 @@ components:
          $ref: '#/components/schemas/DataSource'
          description: >-
            The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
-            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "huggingface",
-            "huggingface": { "dataset_path": "tatsu-lab/alpaca", "params": { "split":
-            "train" } } } - { "type": "rows", "rows": [ { "messages": [ {"role": "user",
-            "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
-            world!"}, ] } ] }
+            } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "uri",
+            "uri": "data:csv;base64,{base64_content}" } - { "type": "uri", "uri":
+            "huggingface://llamastack/simpleqa?split=train" } - { "type": "rows",
+            "rows": [ { "messages": [ {"role": "user", "content": "Hello, world!"},
+            {"role": "assistant", "content": "Hello, world!"}, ] } ] }
        metadata:
          type: object
          additionalProperties:
@ -6824,7 +6788,7 @@ components:
        dataset_id:
          type: string
          description: >-
-            The ID of the dataset. If not provided, a random ID will be generated.
+            The ID of the dataset. If not provided, an ID will be generated.
      additionalProperties: false
      required:
        - purpose
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@ -37,8 +37,8 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

-    @webmethod(route="/datasetio/rows", method="GET")
-    async def get_rows_paginated(
+    @webmethod(route="/datasets/{dataset_id}/iterrows", method="GET")
+    async def iterrows(
        self,
        dataset_id: str,
        rows_in_page: int,
@ -54,5 +54,7 @@ class DatasetIO(Protocol):
        """
        ...

-    @webmethod(route="/datasetio/rows", method="POST")
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
+    @webmethod(route="/datasets/{dataset_id}/rows", method="POST")
+    async def append_rows(
+        self, dataset_id: str, rows: List[Dict[str, Any]]
+    ) -> None: ...
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -60,6 +60,7 @@ class DatasetPurpose(Enum):
            "answer": "Paris"
        }
    """
+
    post_training_messages = "post-training/messages"

    eval_question_answer = "eval/question-answer"
@ -75,11 +76,10 @@ class DatasetPurpose(Enum):
 class DatasetType(Enum):
    """
    Type of the dataset source.
-    :cvar huggingface: The dataset is stored in Huggingface.
-    :cvar uri: The dataset can be obtained from a URI. 
-    :cvar rows: The dataset is stored in rows. 
+    :cvar uri: The dataset can be obtained from a URI.
+    :cvar rows: The dataset is stored in rows.
    """
-    huggingface = "huggingface"
+
    uri = "uri"
    rows = "rows"

@ -92,30 +92,11 @@ class URIDataSource(BaseModel):
        - "lsfs://mydata.jsonl"
        - "data:csv;base64,{base64_content}"
    """
+
    type: Literal["uri"] = "uri"
    uri: str


-class HuggingfaceDataSourceFields(BaseModel):
-    """The fields for a Huggingface dataset.
-    :param path: The path to the dataset in Huggingface. E.g.
-        - "llamastack/simpleqa"
-    :param params: The parameters for the dataset.
-    """
-    path: str
-    params: Dict[str, Any]
-
-
-@json_schema_type
-class HuggingfaceDataSource(BaseModel):
-    """A dataset stored in Huggingface.
-    :param type: The type of the data source.
-    :param huggingface: The fields for a Huggingface dataset.
-    """
-    type: Literal["huggingface"] = "huggingface"
-    huggingface: HuggingfaceDataSourceFields
-
-
@json_schema_type
 class RowsDataSource(BaseModel):
    """A dataset stored in rows.
@ -124,13 +105,14 @@ class RowsDataSource(BaseModel):
            {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
        ]
    """
+
    type: Literal["rows"] = "rows"
    rows: List[Dict[str, Any]]


 DataSource = register_schema(
    Annotated[
-        Union[URIDataSource, HuggingfaceDataSource, RowsDataSource],
+        Union[URIDataSource, RowsDataSource],
        Field(discriminator="type"),
    ],
    name="DataSource",
@ -141,6 +123,7 @@ class CommonDatasetFields(BaseModel):
    """
    Common fields for a dataset.
    """
+
    purpose: DatasetPurpose
    source: DataSource
    metadata: Dict[str, Any] = Field(
@ -237,13 +220,12 @@ class Datasets(Protocol):
               "uri": "lsfs://mydata.jsonl"
           }
           - {
-               "type": "huggingface",
-               "huggingface": {
-                   "dataset_path": "tatsu-lab/alpaca",
-                   "params": {
-                       "split": "train"
-                   }
-               }
+               "type": "uri",
+               "uri": "data:csv;base64,{base64_content}"
+           }
+           - {
+               "type": "uri",
+               "uri": "huggingface://llamastack/simpleqa?split=train"
           }
           - {
               "type": "rows",
@ -258,7 +240,7 @@ class Datasets(Protocol):
           }
        :param metadata: The metadata for the dataset.
           - E.g. {"description": "My dataset"}
-        :param dataset_id: The ID of the dataset. If not provided, a random ID will be generated.
+        :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
        """
        ...