dataset accept file uploads

2025-12-11 11:50:41 +00:00 · 2024-10-14 23:36:15 -07:00 · 2024-10-14 23:36:15 -07:00 · ec6c63ba57
commit ec6c63ba57
parent 3c29108b6e
4 changed files with 58 additions and 9 deletions
--- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py
+++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
@ -3,10 +3,13 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import io
+
 import pandas
 from datasets import Dataset, load_dataset

 from llama_stack.apis.datasets import *  # noqa: F403
+from llama_stack.providers.utils.memory.vector_store import parse_data_url


 class CustomDataset(BaseDataset[DictSample]):
@ -37,11 +40,31 @@ class CustomDataset(BaseDataset[DictSample]):
        if self.dataset:
            return

-        # TODO: better support w/ data url
+        # TODO: more robust support w/ data url
        if self.config.url.endswith(".csv"):
            df = pandas.read_csv(self.config.url)
        elif self.config.url.endswith(".xlsx"):
            df = pandas.read_excel(self.config.url)
+        elif self.config.url.startswith("data:"):
+            parts = parse_data_url(self.config.url)
+            data = parts["data"]
+            if parts["is_base64"]:
+                data = base64.b64decode(data)
+            else:
+                data = unquote(data)
+                encoding = parts["encoding"] or "utf-8"
+                data = data.encode(encoding)
+
+            mime_type = parts["mimetype"]
+            mime_category = mime_type.split("/")[0]
+            data_bytes = io.BytesIO(data)
+
+            if mime_category == "text":
+                df = pandas.read_csv(data_bytes)
+            else:
+                df = pandas.read_excel(data_bytes)
+        else:
+            raise ValueError(f"Unsupported file type: {self.config.url}")

        if n_samples is not None:
            df = df.sample(n=n_samples)