[#342] RAG - fix PDF format in vector database

2025-12-18 13:29:47 +00:00 · 2024-11-30 10:14:05 +00:00 · 2024-11-30 10:14:05 +00:00 · da035d69cf
commit da035d69cf
parent 2fc1c16d58
3 changed files with 94 additions and 4 deletions
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -45,6 +45,13 @@ def get_embedding_model(model: str) -> "SentenceTransformer":
    return loaded_model


+def parse_pdf(data: bytes) -> str:
+    # Extract the text of each PDF page and join the pages with newlines
+    pdf_bytes = io.BytesIO(data)
+    pdf_reader = PdfReader(pdf_bytes)
+    return "\n".join([page.extract_text() for page in pdf_reader.pages])
+
+
 def parse_data_url(data_url: str):
    data_url_pattern = re.compile(
        r"^"
@@ -88,10 +95,7 @@ def content_from_data(data_url: str) -> str:
        return data.decode(encoding)

    elif mime_type == "application/pdf":
-        # For PDF and DOC/DOCX files, we can't reliably convert to string)
-        pdf_bytes = io.BytesIO(data)
-        pdf_reader = PdfReader(pdf_bytes)
-        return "\n".join([page.extract_text() for page in pdf_reader.pages])
+        return parse_pdf(data)

    else:
        log.error("Could not extract content from data_url properly.")
@@ -105,6 +109,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str:
        else:
            async with httpx.AsyncClient() as client:
                r = await client.get(doc.content.uri)
+            if doc.mime_type == "application/pdf":
+                return parse_pdf(r.content)
+            else:
                return r.text

    pattern = re.compile("^(https?://|file://|data:)")
@@ -114,6 +121,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str:
        else:
            async with httpx.AsyncClient() as client:
                r = await client.get(doc.content)
+            if doc.mime_type == "application/pdf":
+                return parse_pdf(r.content)
+            else:
                return r.text

    return interleaved_text_media_as_str(doc.content)