Merge remote-tracking branch 'origin/main' into if_eval

2026-01-01 21:30:01 +00:00 · 2025-03-18 23:23:13 -07:00 · 2025-03-18 23:23:13 -07:00 · a690c7b230
commit a690c7b230
parent 91ef7081d8 7c0448456e
123 changed files with 4482 additions and 3161 deletions
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@ -52,7 +52,7 @@ class Benchmarks(Protocol):
    async def get_benchmark(
        self,
        benchmark_id: str,
-    ) -> Optional[Benchmark]: ...
+    ) -> Benchmark: ...

    @webmethod(route="/eval/benchmarks", method="POST")
    async def register_benchmark(
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@ -13,19 +13,16 @@ from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
-class PaginatedRowsResult(BaseModel):
+class IterrowsResponse(BaseModel):
    """
    A paginated list of rows from a dataset.

-    :param rows: The rows in the current page.
-    :param total_count: The total number of rows in the dataset.
-    :param next_page_token: The token to get the next page of rows.
+    :param data: The rows in the current page.
+    :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
    """

-    # the rows obey the DatasetSchema for the given dataset
-    rows: List[Dict[str, Any]]
-    total_count: int
-    next_page_token: Optional[str] = None
+    data: List[Dict[str, Any]]
+    next_start_index: Optional[int] = None


 class DatasetStore(Protocol):
@ -37,22 +34,21 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

-    @webmethod(route="/datasetio/rows", method="GET")
-    async def get_rows_paginated(
+    # TODO(xiyan): routing is flaky here: setting the route to "/datasets/" does not result in proper routing
+    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
+    async def iterrows(
        self,
        dataset_id: str,
-        rows_in_page: int,
-        page_token: Optional[str] = None,
-        filter_condition: Optional[str] = None,
-    ) -> PaginatedRowsResult:
-        """Get a paginated list of rows from a dataset.
+        start_index: Optional[int] = None,
+        limit: Optional[int] = None,
+    ) -> IterrowsResponse:
+        """Get a paginated list of rows from a dataset. Uses index-based pagination (start_index and limit).

        :param dataset_id: The ID of the dataset to get the rows from.
-        :param rows_in_page: The number of rows to get per page.
-        :param page_token: The token to get the next page of rows.
-        :param filter_condition: (Optional) A condition to filter the rows by.
+        :param start_index: Index into dataset for the first row to get. Get all rows if None.
+        :param limit: The number of rows to get.
        """
        ...

-    @webmethod(route="/datasetio/rows", method="POST")
+    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -4,19 +4,102 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, List, Literal, Optional, Protocol
+from enum import Enum
+from typing import Annotated, Any, Dict, List, Literal, Optional, Protocol, Union

 from pydantic import BaseModel, Field

-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+
+
+class DatasetPurpose(str, Enum):
+    """
+    Purpose of the dataset. Each purpose has a required input data schema.
+
+    :cvar post-training/messages: The dataset contains messages used for post-training.
+        {
+            "messages": [
+                {"role": "user", "content": "Hello, world!"},
+                {"role": "assistant", "content": "Hello, world!"},
+            ]
+        }
+    :cvar eval/question-answer: The dataset contains a question column and an answer column.
+        {
+            "question": "What is the capital of France?",
+            "answer": "Paris"
+        }
+    :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column.
+        {
+            "messages": [
+                {"role": "user", "content": "Hello, my name is John Doe."},
+                {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
+                {"role": "user", "content": "What's my name?"},
+            ],
+            "answer": "John Doe"
+        }
+    """
+
+    post_training_messages = "post-training/messages"
+    eval_question_answer = "eval/question-answer"
+    eval_messages_answer = "eval/messages-answer"
+
+    # TODO: add more schemas here
+
+
+class DatasetType(Enum):
+    """
+    Type of the dataset source.
+    :cvar uri: The dataset can be obtained from a URI.
+    :cvar rows: The dataset is stored in rows.
+    """
+
+    uri = "uri"
+    rows = "rows"
+
+
+@json_schema_type
+class URIDataSource(BaseModel):
+    """A dataset that can be obtained from a URI.
+    :param uri: The dataset can be obtained from a URI. E.g.
+        - "https://mywebsite.com/mydata.jsonl"
+        - "lsfs://mydata.jsonl"
+        - "data:csv;base64,{base64_content}"
+    """
+
+    type: Literal["uri"] = "uri"
+    uri: str
+
+
+@json_schema_type
+class RowsDataSource(BaseModel):
+    """A dataset stored in rows.
+    :param rows: The dataset is stored in rows. E.g.
+        - [
+            {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
+        ]
+    """
+
+    type: Literal["rows"] = "rows"
+    rows: List[Dict[str, Any]]
+
+
+DataSource = register_schema(
+    Annotated[
+        Union[URIDataSource, RowsDataSource],
+        Field(discriminator="type"),
+    ],
+    name="DataSource",
+)


 class CommonDatasetFields(BaseModel):
-    dataset_schema: Dict[str, ParamType]
-    url: URL
+    """
+    Common fields for a dataset.
+    """
+
+    purpose: DatasetPurpose
+    source: DataSource
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Any additional metadata for this dataset",
@ -50,19 +133,75 @@ class Datasets(Protocol):
    @webmethod(route="/datasets", method="POST")
    async def register_dataset(
        self,
-        dataset_id: str,
-        dataset_schema: Dict[str, ParamType],
-        url: URL,
-        provider_dataset_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
+        purpose: DatasetPurpose,
+        source: DataSource,
        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
+        dataset_id: Optional[str] = None,
+    ) -> Dataset:
+        """
+        Register a new dataset.
+
+        :param purpose: The purpose of the dataset. One of
+            - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
+                {
+                    "messages": [
+                        {"role": "user", "content": "Hello, world!"},
+                        {"role": "assistant", "content": "Hello, world!"},
+                    ]
+                }
+            - "eval/question-answer": The dataset contains a question column and an answer column for evaluation.
+                {
+                    "question": "What is the capital of France?",
+                    "answer": "Paris"
+                }
+            - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column for evaluation.
+                {
+                    "messages": [
+                        {"role": "user", "content": "Hello, my name is John Doe."},
+                        {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
+                        {"role": "user", "content": "What's my name?"},
+                    ],
+                    "answer": "John Doe"
+                }
+        :param source: The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples:
+           - {
+               "type": "uri",
+               "uri": "https://mywebsite.com/mydata.jsonl"
+           }
+           - {
+               "type": "uri",
+               "uri": "lsfs://mydata.jsonl"
+           }
+           - {
+               "type": "uri",
+               "uri": "data:csv;base64,{base64_content}"
+           }
+           - {
+               "type": "uri",
+               "uri": "huggingface://llamastack/simpleqa?split=train"
+           }
+           - {
+               "type": "rows",
+               "rows": [
+                   {
+                       "messages": [
+                           {"role": "user", "content": "Hello, world!"},
+                           {"role": "assistant", "content": "Hello, world!"},
+                       ]
+                   }
+               ]
+           }
+        :param metadata: The metadata for the dataset.
+           - E.g. {"description": "My dataset"}
+        :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
+        """
+        ...

    @webmethod(route="/datasets/{dataset_id:path}", method="GET")
    async def get_dataset(
        self,
        dataset_id: str,
-    ) -> Optional[Dataset]: ...
+    ) -> Dataset: ...

    @webmethod(route="/datasets", method="GET")
    async def list_datasets(self) -> ListDatasetsResponse: ...
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -117,7 +117,7 @@ class Eval(Protocol):
        """

    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> JobStatus:
        """Get the status of a job.

        :param benchmark_id: The ID of the benchmark to run the evaluation on.
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@ -115,7 +115,7 @@ class Files(Protocol):
    async def get_upload_session_info(
        self,
        upload_id: str,
-    ) -> Optional[FileUploadResponse]:
+    ) -> FileUploadResponse:
        """
        Returns information about an existing upload session

--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@ -66,7 +66,7 @@ class Models(Protocol):
    async def get_model(
        self,
        model_id: str,
-    ) -> Optional[Model]: ...
+    ) -> Model: ...

    @webmethod(route="/models", method="POST")
    async def register_model(
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@ -6,7 +6,7 @@

 from datetime import datetime
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Protocol, Union
+from typing import Any, Dict, List, Literal, Optional, Protocol

 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
@ -89,7 +89,7 @@ class QATFinetuningConfig(BaseModel):


 AlgorithmConfig = register_schema(
-    Annotated[Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")],
+    Annotated[LoraFinetuningConfig | QATFinetuningConfig, Field(discriminator="type")],
    name="AlgorithmConfig",
 )

@ -184,7 +184,7 @@ class PostTraining(Protocol):
            description="Model descriptor from `llama model list`",
        ),
        checkpoint_dir: Optional[str] = None,
-        algorithm_config: Optional[AlgorithmConfig] = None,
+        algorithm_config: Optional[LoraFinetuningConfig | QATFinetuningConfig] = None,
    ) -> PostTrainingJob: ...

    @webmethod(route="/post-training/preference-optimize", method="POST")
@ -202,10 +202,10 @@ class PostTraining(Protocol):
    async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...

    @webmethod(route="/post-training/job/status", method="GET")
-    async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]: ...
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse: ...

    @webmethod(route="/post-training/job/cancel", method="POST")
    async def cancel_training_job(self, job_uuid: str) -> None: ...

    @webmethod(route="/post-training/job/artifacts", method="GET")
-    async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]: ...
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse: ...
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -136,7 +136,7 @@ class ScoringFunctions(Protocol):
    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...

    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ...
+    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ...

    @webmethod(route="/scoring-functions", method="POST")
    async def register_scoring_function(
--- a/llama_stack/apis/shields/shields.py
+++ b/llama_stack/apis/shields/shields.py
@ -49,7 +49,7 @@ class Shields(Protocol):
    async def list_shields(self) -> ListShieldsResponse: ...

    @webmethod(route="/shields/{identifier:path}", method="GET")
-    async def get_shield(self, identifier: str) -> Optional[Shield]: ...
+    async def get_shield(self, identifier: str) -> Shield: ...

    @webmethod(route="/shields", method="POST")
    async def register_shield(
--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@ -50,7 +50,7 @@ class VectorDBs(Protocol):
    async def get_vector_db(
        self,
        vector_db_id: str,
-    ) -> Optional[VectorDB]: ...
+    ) -> VectorDB: ...

    @webmethod(route="/vector-dbs", method="POST")
    async def register_vector_db(