mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-18 13:29:47 +00:00
[#342] RAG - fix PDF format in vector database
This commit is contained in:
parent
2fc1c16d58
commit
da035d69cf
3 changed files with 94 additions and 4 deletions
|
|
@ -45,6 +45,13 @@ def get_embedding_model(model: str) -> "SentenceTransformer":
|
|||
return loaded_model
|
||||
|
||||
|
||||
def parse_pdf(data: bytes) -> str:
    """Extract the text content of a PDF supplied as raw bytes.

    The bytes are wrapped in an in-memory buffer and opened with
    ``PdfReader``; the text extracted from each page is then joined with
    newline characters, in page order.

    Args:
        data: The raw bytes of the PDF document.

    Returns:
        The extracted text of all pages, one page per newline-separated
        segment.
    """
    reader = PdfReader(io.BytesIO(data))
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_texts)
|
||||
|
||||
|
||||
def parse_data_url(data_url: str):
|
||||
data_url_pattern = re.compile(
|
||||
r"^"
|
||||
|
|
@ -88,10 +95,7 @@ def content_from_data(data_url: str) -> str:
|
|||
return data.decode(encoding)
|
||||
|
||||
elif mime_type == "application/pdf":
|
||||
# For PDF and DOC/DOCX files, we can't reliably convert to string)
|
||||
pdf_bytes = io.BytesIO(data)
|
||||
pdf_reader = PdfReader(pdf_bytes)
|
||||
return "\n".join([page.extract_text() for page in pdf_reader.pages])
|
||||
return parse_pdf(data)
|
||||
|
||||
else:
|
||||
log.error("Could not extract content from data_url properly.")
|
||||
|
|
@ -105,6 +109,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str:
|
|||
else:
|
||||
async with httpx.AsyncClient() as client:
|
||||
r = await client.get(doc.content.uri)
|
||||
if doc.mime_type == "application/pdf":
|
||||
return parse_pdf(r.content)
|
||||
else:
|
||||
return r.text
|
||||
|
||||
pattern = re.compile("^(https?://|file://|data:)")
|
||||
|
|
@ -114,6 +121,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str:
|
|||
else:
|
||||
async with httpx.AsyncClient() as client:
|
||||
r = await client.get(doc.content)
|
||||
if doc.mime_type == "application/pdf":
|
||||
return parse_pdf(r.content)
|
||||
else:
|
||||
return r.text
|
||||
|
||||
return interleaved_text_media_as_str(doc.content)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue