llama-stack/llama_stack/providers/utils/datasetio/url_utils.py
Xi Yan a568bf3f9d
feat(dataset api): (1.5/n) fix dataset registration (#1659)
# What does this PR do?

- fix dataset registration & iterrows
> NOTE: the URL endpoint has been changed to datasetio because path
routing was flaky

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/datasets/test_datasets.py
```
<img width="854" alt="image"
src="https://github.com/user-attachments/assets/0168b352-1c5a-48d1-8e9a-93141d418e54"
/>


[//]: # (## Documentation)
2025-03-15 16:48:09 -07:00

43 lines
1.2 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import io
from urllib.parse import unquote
import pandas
from llama_stack.providers.utils.memory.vector_store import parse_data_url
def get_dataframe_from_uri(uri: str):
    """Load the tabular data referenced by *uri* into a pandas DataFrame.

    Three kinds of input are recognised:

    * a ``data:`` URL -- the payload is decoded (base64 or percent-encoding)
      and parsed as CSV when the MIME type's top-level category is ``text``,
      otherwise as an Excel workbook;
    * a path or URL ending in ``.csv`` (any letter case) -- passed straight
      to ``pandas.read_csv``;
    * a path or URL ending in ``.xlsx`` (any letter case) -- passed straight
      to ``pandas.read_excel``.

    Args:
        uri: Data URL, file path, or remote URL of the dataset.

    Returns:
        The ``pandas.DataFrame`` produced by the matching pandas reader.

    Raises:
        ValueError: If *uri* is neither a ``data:`` URL nor ends with a
            supported extension.
    """
    # Check the scheme before the extension: the payload of a non-base64
    # data URL is arbitrary text and may itself end in ".csv" or ".xlsx",
    # in which case it must not be mistaken for a file path.
    if uri.startswith("data:"):
        # NOTE(review): parse_data_url is defined in another module; this
        # code relies on it returning a mapping with the keys "data",
        # "is_base64", "encoding" and "mimetype" -- confirm against it.
        parts = parse_data_url(uri)
        payload = parts["data"]
        if parts["is_base64"]:
            raw = base64.b64decode(payload)
        else:
            # Percent-decode the text, then turn it into bytes using the
            # charset declared in the URL (UTF-8 when none is given).
            text = unquote(payload)
            encoding = parts["encoding"] or "utf-8"
            raw = text.encode(encoding)

        mime_category = parts["mimetype"].split("/")[0]
        buffer = io.BytesIO(raw)
        if mime_category == "text":
            return pandas.read_csv(buffer)
        # Every non-text MIME type is treated as an Excel workbook.
        return pandas.read_excel(buffer)

    # Compare extensions case-insensitively so "DATA.CSV" is accepted too;
    # the original (unmodified) uri is still what pandas receives.
    lowered = uri.lower()
    if lowered.endswith(".csv"):
        return pandas.read_csv(uri)
    if lowered.endswith(".xlsx"):
        return pandas.read_excel(uri)

    raise ValueError(f"Unsupported file type: {uri}")