feat(dataset api): (1.1/n) dataset api implementation fix pre-commit (#1625)

# What does this PR do?

- Fix pre-commit with API updates

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

```
pre-commit
```

[//]: # (## Documentation)
2025-03-13 16:41:03 -07:00 · 2025-03-13 16:41:03 -07:00 · 7606e49dbc
commit 7606e49dbc
parent a6095820af
5 changed files with 42 additions and 97 deletions
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -55,6 +55,4 @@ class DatasetIO(Protocol):
        ...

    @webmethod(route="/datasets/{dataset_id}/rows", method="POST")
-    async def append_rows(
-        self, dataset_id: str, rows: List[Dict[str, Any]]
-    ) -> None: ...
+    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -13,7 +13,7 @@ from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


-class DatasetPurpose(Enum):
+class DatasetPurpose(str, Enum):
    """
    Purpose of the dataset. Each purpose has a required input data schema.

--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -6,7 +6,7 @@

 from typing import Dict, List, Tuple

-from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
 from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
    BenchmarkInput,
@@ -171,60 +171,34 @@ def get_distribution_template() -> DistributionTemplate:
        DatasetInput(
            dataset_id="simpleqa",
            provider_id="huggingface",
-            url=URL(uri="https://huggingface.co/datasets/llamastack/simpleqa"),
-            metadata={
-                "path": "llamastack/simpleqa",
-                "split": "train",
-            },
-            dataset_schema={
-                "input_query": {"type": "string"},
-                "expected_answer": {"type": "string"},
-                "chat_completion_input": {"type": "string"},
-            },
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://llamastack/simpleqa?split=train",
+            ),
        ),
        DatasetInput(
            dataset_id="mmlu_cot",
            provider_id="huggingface",
-            url=URL(uri="https://huggingface.co/datasets/llamastack/mmlu_cot"),
-            metadata={
-                "path": "llamastack/mmlu_cot",
-                "name": "all",
-                "split": "test",
-            },
-            dataset_schema={
-                "input_query": {"type": "string"},
-                "expected_answer": {"type": "string"},
-                "chat_completion_input": {"type": "string"},
-            },
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://llamastack/mmlu_cot?split=test&name=all",
+            ),
        ),
        DatasetInput(
            dataset_id="gpqa_cot",
            provider_id="huggingface",
-            url=URL(uri="https://huggingface.co/datasets/llamastack/gpqa_0shot_cot"),
-            metadata={
-                "path": "llamastack/gpqa_0shot_cot",
-                "name": "gpqa_main",
-                "split": "train",
-            },
-            dataset_schema={
-                "input_query": {"type": "string"},
-                "expected_answer": {"type": "string"},
-                "chat_completion_input": {"type": "string"},
-            },
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://llamastack/gpqa_0shot_cot?split=test&name=gpqa_main",
+            ),
        ),
        DatasetInput(
            dataset_id="math_500",
            provider_id="huggingface",
-            url=URL(uri="https://huggingface.co/datasets/llamastack/math_500"),
-            metadata={
-                "path": "llamastack/math_500",
-                "split": "test",
-            },
-            dataset_schema={
-                "input_query": {"type": "string"},
-                "expected_answer": {"type": "string"},
-                "chat_completion_input": {"type": "string"},
-            },
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://llamastack/math_500?split=test",
+            ),
        ),
    ]

--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -158,62 +158,32 @@ shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
 vector_dbs: []
 datasets:
- dataset_schema:
-    input_query:
-      type: string
-    expected_answer:
-      type: string
-    chat_completion_input:
-      type: string
-  url:
-    uri: https://huggingface.co/datasets/llamastack/simpleqa
-  metadata:
-    path: llamastack/simpleqa
-    split: train
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://llamastack/simpleqa?split=train
+  metadata: {}
  dataset_id: simpleqa
  provider_id: huggingface
- dataset_schema:
-    input_query:
-      type: string
-    expected_answer:
-      type: string
-    chat_completion_input:
-      type: string
-  url:
-    uri: https://huggingface.co/datasets/llamastack/mmlu_cot
-  metadata:
-    path: llamastack/mmlu_cot
-    name: all
-    split: test
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://llamastack/mmlu_cot?split=test&name=all
+  metadata: {}
  dataset_id: mmlu_cot
  provider_id: huggingface
- dataset_schema:
-    input_query:
-      type: string
-    expected_answer:
-      type: string
-    chat_completion_input:
-      type: string
-  url:
-    uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot
-  metadata:
-    path: llamastack/gpqa_0shot_cot
-    name: gpqa_main
-    split: train
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
+  metadata: {}
  dataset_id: gpqa_cot
  provider_id: huggingface
- dataset_schema:
-    input_query:
-      type: string
-    expected_answer:
-      type: string
-    chat_completion_input:
-      type: string
-  url:
-    uri: https://huggingface.co/datasets/llamastack/math_500
-  metadata:
-    path: llamastack/math_500
-    split: test
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://llamastack/math_500?split=test
+  metadata: {}
  dataset_id: math_500
  provider_id: huggingface
 scoring_fns: []
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -11,6 +11,7 @@ import jinja2
 import yaml
 from pydantic import BaseModel, Field

+from llama_stack.apis.datasets import DatasetPurpose
 from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
    Api,
@@ -214,7 +215,9 @@ class DistributionTemplate(BaseModel):

        # Register YAML representer for ModelType
        yaml.add_representer(ModelType, enum_representer)
+        yaml.add_representer(DatasetPurpose, enum_representer)
        yaml.SafeDumper.add_representer(ModelType, enum_representer)
+        yaml.SafeDumper.add_representer(DatasetPurpose, enum_representer)

        for output_dir in [yaml_output_dir, doc_output_dir]:
            output_dir.mkdir(parents=True, exist_ok=True)