wip huggingface register

2025-12-17 13:12:36 +00:00 · 2024-11-07 15:59:55 -08:00 · 2024-11-07 15:59:55 -08:00 · 37d87c585a
commit 37d87c585a
parent d1633dc412
5 changed files with 82 additions and 16 deletions
--- a/llama_stack/providers/utils/datasetio/init.py
+++ b/llama_stack/providers/utils/datasetio/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/llama_stack/providers/utils/datasetio/url_utils.py
+++ b/llama_stack/providers/utils/datasetio/url_utils.py
@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import base64
+import mimetypes
+import os
+
+from llama_models.llama3.api.datatypes import URL
+
+
+def data_url_from_file(file_path: str) -> URL:
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    with open(file_path, "rb") as file:
+        file_content = file.read()
+
+    base64_content = base64.b64encode(file_content).decode("utf-8")
+    mime_type, _ = mimetypes.guess_type(file_path)
+
+    data_url = f"data:{mime_type};base64,{base64_content}"
+
+    return URL(uri=data_url)
--- a/llama_stack/providers/utils/memory/file_utils.py
+++ b/llama_stack/providers/utils/memory/file_utils.py
@ -5,22 +5,41 @@
 # the root directory of this source tree.

 import base64
-import mimetypes
-import os
+import io
+from urllib.parse import unquote
+
+import pandas

 from llama_models.llama3.api.datatypes import URL

+from llama_stack.providers.utils.memory.vector_store import parse_data_url

-def data_url_from_file(file_path: str) -> URL:
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")

-    with open(file_path, "rb") as file:
-        file_content = file.read()
+def get_dataframe_from_url(url: URL):
+    df = None
+    if url.uri.endswith(".csv"):
+        df = pandas.read_csv(url.uri)
+    elif url.uri.endswith(".xlsx"):
+        df = pandas.read_excel(url.uri)
+    elif url.uri.startswith("data:"):
+        parts = parse_data_url(url.uri)
+        data = parts["data"]
+        if parts["is_base64"]:
+            data = base64.b64decode(data)
+        else:
+            data = unquote(data)
+            encoding = parts["encoding"] or "utf-8"
+            data = data.encode(encoding)

-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
+        mime_type = parts["mimetype"]
+        mime_category = mime_type.split("/")[0]
+        data_bytes = io.BytesIO(data)

-    data_url = f"data:{mime_type};base64,{base64_content}"
+        if mime_category == "text":
+            df = pandas.read_csv(data_bytes)
+        else:
+            df = pandas.read_excel(data_bytes)
+    else:
+        raise ValueError(f"Unsupported file type: {url}")

-    return URL(uri=data_url)
+    return df