Add PDF support to file_search for Responses API

This adds basic PDF support (using our existing `parse_pdf` function)
to the file_search tool and corresponding Vector Files API.

When a PDF file is uploaded and attached to a vector store, we parse
the PDF and then chunk its content as normal. This is not the best
solution long-term, but it does match what we've been doing so far for
PDF files in the memory tool.
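
For context, a parse_pdf-style helper can be as small as the sketch
below. This is a minimal illustration assuming pypdf; the actual
helper in llama_stack.providers.utils.memory.vector_store may differ
in details, but as the diff shows, it takes the raw file bytes and
returns extracted text:

    from io import BytesIO

    from pypdf import PdfReader

    def parse_pdf(data: bytes) -> str:
        # Read the PDF from an in-memory buffer and join the text
        # extracted from each page; pages with no extractable text
        # contribute an empty string.
        reader = PdfReader(BytesIO(data))
        return "\n".join(page.extract_text() or "" for page in reader.pages)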

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Author: Ben Browning
Date:   2025-06-11 16:45:28 -04:00
Parent: 57eccf023d
Commit: 055885bd5a

4 changed files with 41 additions and 33 deletions

@@ -9,6 +9,7 @@ import base64
 import io
 import json
 import logging
+import mimetypes
 import time
 from typing import Any
@@ -19,7 +20,6 @@ from numpy.typing import NDArray
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.inference.inference import Inference
-from llama_stack.apis.tools.rag_tool import RAGDocument
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
     Chunk,
@@ -40,8 +40,8 @@ from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import (
     EmbeddingIndex,
     VectorDBWithIndex,
-    content_from_doc,
     make_overlapped_chunks,
+    parse_pdf,
 )
 
 from .config import FaissVectorIOConfig
@@ -292,20 +292,23 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
         chunk_overlap_tokens = 400
         try:
             file_response = await self.files_api.openai_retrieve_file(file_id)
+            mime_type, _ = mimetypes.guess_type(file_response.filename)
             content_response = await self.files_api.openai_retrieve_file_content(file_id)
-            content = content_response.body
-            doc = RAGDocument(
-                document_id=file_id,
-                content=content,
-                metadata=attributes,
-            )
-            content = await content_from_doc(doc)
+
+            # TODO: We can't use content_from_doc directly from vector_store
+            # but should figure out how to centralize this logic near there
+            if mime_type == "application/pdf":
+                content = parse_pdf(content_response.body)
+            else:
+                content = content_response.body.decode("utf-8")
+
             chunks = make_overlapped_chunks(
-                doc.document_id,
+                file_id,
                 content,
                 max_chunk_size_tokens,
                 chunk_overlap_tokens,
-                doc.metadata,
+                attributes,
             )
             if not chunks:
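
Note that mimetypes.guess_type keys off the filename extension alone,
so a PDF uploaded without a .pdf extension will fall through to the
UTF-8 decode path above. A quick standard-library illustration:

    import mimetypes

    # Detection is purely name-based; no file contents are inspected.
    print(mimetypes.guess_type("report.pdf"))  # ('application/pdf', None)
    print(mimetypes.guess_type("report"))      # (None, None)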