mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-09 23:25:58 +00:00
Add PDF support to file_search for Responses API
This adds basic PDF support (using our existing `parse_pdf` function) to the file_search tool and the corresponding Vector Files API. When a PDF file is uploaded and attached to a vector store, we parse the PDF and then chunk its content as normal. This is not the best solution long-term, but it does match what we've been doing so far for PDF files in the memory tool. Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
parent
57eccf023d
commit
055885bd5a
4 changed files with 41 additions and 33 deletions
|
@ -9,6 +9,7 @@ import base64
|
|||
import io
|
||||
import json
|
||||
import logging
|
||||
import mimetypes
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
|
@ -19,7 +20,6 @@ from numpy.typing import NDArray
|
|||
from llama_stack.apis.files import Files
|
||||
from llama_stack.apis.inference import InterleavedContent
|
||||
from llama_stack.apis.inference.inference import Inference
|
||||
from llama_stack.apis.tools.rag_tool import RAGDocument
|
||||
from llama_stack.apis.vector_dbs import VectorDB
|
||||
from llama_stack.apis.vector_io import (
|
||||
Chunk,
|
||||
|
@ -40,8 +40,8 @@ from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIV
|
|||
from llama_stack.providers.utils.memory.vector_store import (
|
||||
EmbeddingIndex,
|
||||
VectorDBWithIndex,
|
||||
content_from_doc,
|
||||
make_overlapped_chunks,
|
||||
parse_pdf,
|
||||
)
|
||||
|
||||
from .config import FaissVectorIOConfig
|
||||
|
@ -292,20 +292,23 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
|
|||
chunk_overlap_tokens = 400
|
||||
|
||||
try:
|
||||
file_response = await self.files_api.openai_retrieve_file(file_id)
|
||||
mime_type, _ = mimetypes.guess_type(file_response.filename)
|
||||
content_response = await self.files_api.openai_retrieve_file_content(file_id)
|
||||
content = content_response.body
|
||||
doc = RAGDocument(
|
||||
document_id=file_id,
|
||||
content=content,
|
||||
metadata=attributes,
|
||||
)
|
||||
content = await content_from_doc(doc)
|
||||
|
||||
# TODO: We can't use content_from_doc directly from vector_store
|
||||
# but should figure out how to centralize this logic near there
|
||||
if mime_type == "application/pdf":
|
||||
content = parse_pdf(content_response.body)
|
||||
else:
|
||||
content = content_response.body.decode("utf-8")
|
||||
|
||||
chunks = make_overlapped_chunks(
|
||||
doc.document_id,
|
||||
file_id,
|
||||
content,
|
||||
max_chunk_size_tokens,
|
||||
chunk_overlap_tokens,
|
||||
doc.metadata,
|
||||
attributes,
|
||||
)
|
||||
|
||||
if not chunks:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue