fix: Call pandas.read_* in a seperate thread (#1698)

These block on io reads which in turn block the server. Move them to their own thread. Closes: #1697 # What does this PR do? To avoid blocking the main eventloop, updates datasetio/localfs to load data in a seperate thread Signed-off-by: Derek Higgins <derekh@redhat.com>
2025-06-28 19:04:19 +00:00 · 2025-03-19 17:46:37 +00:00 · 2025-03-19 17:46:37 +00:00 · 6949bd1999
commit 6949bd1999
parent 65ca85ba6b
2 changed files with 11 additions and 7 deletions
--- a/llama_stack/providers/utils/datasetio/url_utils.py
+++ b/llama_stack/providers/utils/datasetio/url_utils.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import base64
 import io
 from urllib.parse import unquote
@ -13,12 +14,15 @@ import pandas
 from llama_stack.providers.utils.memory.vector_store import parse_data_url


-def get_dataframe_from_uri(uri: str):
+async def get_dataframe_from_uri(uri: str):
    df = None
    if uri.endswith(".csv"):
-        df = pandas.read_csv(uri)
+        # Moving to its own thread to avoid io from blocking the eventloop
+        # This isn't ideal as it moves more then just the IO to a new thread
+        # but it is as close as we can easly get
+        df = await asyncio.to_thread(pandas.read_csv, uri)
    elif uri.endswith(".xlsx"):
-        df = pandas.read_excel(uri)
+        df = await asyncio.to_thread(pandas.read_excel, uri)
    elif uri.startswith("data:"):
        parts = parse_data_url(uri)
        data = parts["data"]