feat(api): (1/n) datasets api clean up (#1573)

## PR Stack
- https://github.com/meta-llama/llama-stack/pull/1573
- https://github.com/meta-llama/llama-stack/pull/1625
- https://github.com/meta-llama/llama-stack/pull/1656
- https://github.com/meta-llama/llama-stack/pull/1657
- https://github.com/meta-llama/llama-stack/pull/1658
- https://github.com/meta-llama/llama-stack/pull/1659
- https://github.com/meta-llama/llama-stack/pull/1660

**Client SDK**
- https://github.com/meta-llama/llama-stack-client-python/pull/203

**CI**
- 1391130488
<img width="1042" alt="image"
src="https://github.com/user-attachments/assets/69636067-376d-436b-9204-896e2dd490ca"
/>
Note: the `test_rag_agent_with_attachments` test is flaky; its failure is
not related to this PR.

## Doc
<img width="789" alt="image"
src="https://github.com/user-attachments/assets/b88390f3-73d6-4483-b09a-a192064e32d9"
/>


## Client Usage
```python
client.datasets.register(
    source={
        "type": "uri",
        "uri": "lsfs://mydata.jsonl",
    },
    schema="jsonl_messages",
    # optional 
    dataset_id="my_first_train_data"
)

# quick prototype debugging
client.datasets.register(
    source={
        "type": "rows",
        "rows": [
            {"messages": [...]},
        ],
    },
    schema="jsonl_messages",
)
```

## Test Plan
- CI:
1387805545

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/datasets/test_datasets.py
```

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring/test_scoring.py
```

```
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```
This commit is contained in:
Xi Yan 2025-03-17 16:55:45 -07:00 committed by GitHub
parent 3b35a39b8b
commit 5287b437ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 2593 additions and 2296 deletions

View file

@ -40,75 +40,7 @@
}
],
"paths": {
"/v1/datasetio/rows": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/PaginatedRowsResult"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"DatasetIO"
],
"description": "Get a paginated list of rows from a dataset.",
"parameters": [
{
"name": "dataset_id",
"in": "query",
"description": "The ID of the dataset to get the rows from.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "rows_in_page",
"in": "query",
"description": "The number of rows to get per page.",
"required": true,
"schema": {
"type": "integer"
}
},
{
"name": "page_token",
"in": "query",
"description": "The token to get the next page of rows.",
"required": false,
"schema": {
"type": "string"
}
},
{
"name": "filter_condition",
"in": "query",
"description": "(Optional) A condition to filter the rows by.",
"required": false,
"schema": {
"type": "string"
}
}
]
},
"/v1/datasetio/append-rows/{dataset_id}": {
"post": {
"responses": {
"200": {
@ -131,7 +63,16 @@
"DatasetIO"
],
"description": "",
"parameters": [],
"parameters": [
{
"name": "dataset_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
@ -583,7 +524,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "List all buckets.",
"parameters": [
@ -623,7 +564,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "Create a new upload session for a file identified by a bucket and key.",
"parameters": [],
@ -850,7 +791,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "Get a file info identified by a bucket and key.",
"parameters": [
@ -900,7 +841,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "Delete a file identified by a bucket and key.",
"parameters": [
@ -1889,7 +1830,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "Returns information about an existing upload session",
"parameters": [
@ -1937,7 +1878,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "Upload file content to an existing upload session. On the server, request body will have the raw bytes that are uploaded.",
"parameters": [
@ -2236,6 +2177,67 @@
}
}
},
"/v1/datasetio/iterrows/{dataset_id}": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/IterrowsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"DatasetIO"
],
"description": "Get a paginated list of rows from a dataset. Uses offset-based pagination (start_index and limit).",
"parameters": [
{
"name": "dataset_id",
"in": "path",
"description": "The ID of the dataset to get the rows from.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "start_index",
"in": "query",
"description": "Index into dataset for the first row to get. Get all rows if None.",
"required": false,
"schema": {
"type": "integer"
}
},
{
"name": "limit",
"in": "query",
"description": "The number of rows to get.",
"required": false,
"schema": {
"type": "integer"
}
}
]
}
},
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
@ -2535,7 +2537,14 @@
"post": {
"responses": {
"200": {
"description": "OK"
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Dataset"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
@ -2553,7 +2562,7 @@
"tags": [
"Datasets"
],
"description": "",
"description": "Register a new dataset.",
"parameters": [],
"requestBody": {
"content": {
@ -2594,7 +2603,7 @@
}
},
"tags": [
"Files (Coming Soon)"
"Files"
],
"description": "List all files in a bucket.",
"parameters": [
@ -3824,9 +3833,6 @@
"AppendRowsRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"rows": {
"type": "array",
"items": {
@ -3858,7 +3864,6 @@
},
"additionalProperties": false,
"required": [
"dataset_id",
"rows"
],
"title": "AppendRowsRequest"
@ -6824,6 +6829,224 @@
],
"title": "Benchmark"
},
"DataSource": {
"oneOf": [
{
"$ref": "#/components/schemas/URIDataSource"
},
{
"$ref": "#/components/schemas/RowsDataSource"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"uri": "#/components/schemas/URIDataSource",
"rows": "#/components/schemas/RowsDataSource"
}
}
},
"Dataset": {
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"provider_resource_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"type": {
"type": "string",
"const": "dataset",
"default": "dataset"
},
"purpose": {
"type": "string",
"enum": [
"post-training/messages",
"eval/question-answer",
"eval/messages-answer"
],
"title": "DatasetPurpose",
"description": "Purpose of the dataset. Each purpose has a required input data schema."
},
"source": {
"$ref": "#/components/schemas/DataSource"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"identifier",
"provider_resource_id",
"provider_id",
"type",
"purpose",
"source",
"metadata"
],
"title": "Dataset"
},
"RowsDataSource": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "rows",
"default": "rows"
},
"rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]"
}
},
"additionalProperties": false,
"required": [
"type",
"rows"
],
"title": "RowsDataSource",
"description": "A dataset stored in rows."
},
"URIDataSource": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "uri",
"default": "uri"
},
"uri": {
"type": "string",
"description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\""
}
},
"additionalProperties": false,
"required": [
"type",
"uri"
],
"title": "URIDataSource",
"description": "A dataset that can be obtained from a URI."
},
"Model": {
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"provider_resource_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"type": {
"type": "string",
"const": "model",
"default": "model"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"model_type": {
"$ref": "#/components/schemas/ModelType",
"default": "llm"
}
},
"additionalProperties": false,
"required": [
"identifier",
"provider_resource_id",
"provider_id",
"type",
"metadata",
"model_type"
],
"title": "Model"
},
"ModelType": {
"type": "string",
"enum": [
"llm",
"embedding"
],
"title": "ModelType"
},
"AgentTurnInputType": {
"type": "object",
"properties": {
@ -6899,70 +7122,6 @@
],
"title": "CompletionInputType"
},
"Dataset": {
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"provider_resource_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"type": {
"type": "string",
"const": "dataset",
"default": "dataset"
},
"dataset_schema": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ParamType"
}
},
"url": {
"$ref": "#/components/schemas/URL"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"identifier",
"provider_resource_id",
"provider_id",
"type",
"dataset_schema",
"url",
"metadata"
],
"title": "Dataset"
},
"JsonType": {
"type": "object",
"properties": {
@ -7057,151 +7216,6 @@
}
}
},
"StringType": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "string",
"default": "string"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "StringType"
},
"UnionType": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "union",
"default": "union"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "UnionType"
},
"Model": {
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"provider_resource_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"type": {
"type": "string",
"const": "model",
"default": "model"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"model_type": {
"$ref": "#/components/schemas/ModelType",
"default": "llm"
}
},
"additionalProperties": false,
"required": [
"identifier",
"provider_resource_id",
"provider_id",
"type",
"metadata",
"model_type"
],
"title": "Model"
},
"ModelType": {
"type": "string",
"enum": [
"llm",
"embedding"
],
"title": "ModelType"
},
"PaginatedRowsResult": {
"type": "object",
"properties": {
"rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The rows in the current page."
},
"total_count": {
"type": "integer",
"description": "The total number of rows in the dataset."
},
"next_page_token": {
"type": "string",
"description": "The token to get the next page of rows."
}
},
"additionalProperties": false,
"required": [
"rows",
"total_count"
],
"title": "PaginatedRowsResult",
"description": "A paginated list of rows from a dataset."
},
"ScoringFn": {
"type": "object",
"properties": {
@ -7265,6 +7279,36 @@
],
"title": "ScoringFn"
},
"StringType": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "string",
"default": "string"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "StringType"
},
"UnionType": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "union",
"default": "union"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "UnionType"
},
"Shield": {
"type": "object",
"properties": {
@ -8084,6 +8128,50 @@
],
"title": "ToolInvocationResult"
},
"IterrowsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The rows in the current page."
},
"next_start_index": {
"type": "integer",
"description": "Index into dataset for the first row in the next page. None if there are no more rows."
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "IterrowsResponse",
"description": "A paginated list of rows from a dataset."
},
"ListAgentSessionsResponse": {
"type": "object",
"properties": {
@ -9330,23 +9418,18 @@
"RegisterDatasetRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
"purpose": {
"type": "string",
"enum": [
"post-training/messages",
"eval/question-answer",
"eval/messages-answer"
],
"description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/question-answer\": The dataset contains a question column and an answer column for evaluation. { \"question\": \"What is the capital of France?\", \"answer\": \"Paris\" } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column for evaluation. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, my name is John Doe.\"}, {\"role\": \"assistant\", \"content\": \"Hello, John Doe. How can I help you today?\"}, {\"role\": \"user\", \"content\": \"What's my name?\"}, ], \"answer\": \"John Doe\" }"
},
"dataset_schema": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ParamType"
}
},
"url": {
"$ref": "#/components/schemas/URL"
},
"provider_dataset_id": {
"type": "string"
},
"provider_id": {
"type": "string"
"source": {
"$ref": "#/components/schemas/DataSource",
"description": "The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
},
"metadata": {
"type": "object",
@ -9371,14 +9454,18 @@
"type": "object"
}
]
}
},
"description": "The metadata for the dataset. - E.g. {\"description\": \"My dataset\"}"
},
"dataset_id": {
"type": "string",
"description": "The ID of the dataset. If not provided, an ID will be generated."
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"dataset_schema",
"url"
"purpose",
"source"
],
"title": "RegisterDatasetRequest"
},
@ -10197,7 +10284,7 @@
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
},
{
"name": "Files (Coming Soon)"
"name": "Files"
},
{
"name": "Inference",
@ -10258,7 +10345,7 @@
"DatasetIO",
"Datasets",
"Eval",
"Files (Coming Soon)",
"Files",
"Inference",
"Inspect",
"Models",