From 788d34d8b407e25db6d58d1264b47a2781bc131a Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 11 Jun 2025 20:06:30 -0400
Subject: [PATCH] Move vector file attach code to OpenAIVectorStoreMixin

This moves the vector store file attach code into
OpenAIVectorStoreMixin, so every provider that uses the mixin shares a
single implementation. It also centralizes the MIME type and PDF
parsing logic in the existing functions in vector_store.py, with a
small refactor there that lets both call sites use the same code path.

Signed-off-by: Ben Browning
---
 .../providers/inline/vector_io/faiss/faiss.py | 82 -----------------
 .../utils/memory/openai_vector_store_mixin.py | 89 +++++++++++++++++++
 .../providers/utils/memory/vector_store.py    | 16 ++--
 .../openai_api/test_responses.py              |  2 +-
 4 files changed, 99 insertions(+), 90 deletions(-)

diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index b1326c06f..69b7cc22d 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -9,8 +9,6 @@ import base64
 import io
 import json
 import logging
-import mimetypes
-import time
 from typing import Any
 
 import faiss
@@ -26,13 +24,6 @@ from llama_stack.apis.vector_io import (
     QueryChunksResponse,
     VectorIO,
 )
-from llama_stack.apis.vector_io.vector_io import (
-    VectorStoreChunkingStrategy,
-    VectorStoreChunkingStrategyAuto,
-    VectorStoreChunkingStrategyStatic,
-    VectorStoreFileLastError,
-    VectorStoreFileObject,
-)
 from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
 from llama_stack.providers.utils.kvstore import kvstore_impl
 from llama_stack.providers.utils.kvstore.api import KVStore
@@ -40,8 +31,6 @@ from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIV
 from llama_stack.providers.utils.memory.vector_store import (
     EmbeddingIndex,
     VectorDBWithIndex,
-    make_overlapped_chunks,
-    parse_pdf,
 )
 
 from .config import FaissVectorIOConfig
@@ -263,74 +252,3 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
         assert self.kvstore is not None
         key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
         await self.kvstore.delete(key)
-
-    async def openai_attach_file_to_vector_store(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        attributes: dict[str, Any] | None = None,
-        chunking_strategy: VectorStoreChunkingStrategy | None = None,
-    ) -> VectorStoreFileObject:
-        attributes = attributes or {}
-        chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
-
-        vector_store_file_object = VectorStoreFileObject(
-            id=file_id,
-            attributes=attributes,
-            chunking_strategy=chunking_strategy,
-            created_at=int(time.time()),
-            status="in_progress",
-            vector_store_id=vector_store_id,
-        )
-
-        if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
-            max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
-            chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
-        else:
-            # Default values from OpenAI API docs
-            max_chunk_size_tokens = 800
-            chunk_overlap_tokens = 400
-
-        try:
-            file_response = await self.files_api.openai_retrieve_file(file_id)
-            mime_type, _ = mimetypes.guess_type(file_response.filename)
-            content_response = await self.files_api.openai_retrieve_file_content(file_id)
-
-            # TODO: We can't use content_from_doc directly from vector_store
-            # but should figure out how to centralize this logic near there
-            if mime_type == "application/pdf":
-                content = parse_pdf(content_response.body)
-            else:
-                content = content_response.body.decode("utf-8")
-
-            chunks = make_overlapped_chunks(
-                file_id,
-                content,
-                max_chunk_size_tokens,
-                chunk_overlap_tokens,
-                attributes,
-            )
-
-            if not chunks:
-                vector_store_file_object.status = "failed"
-                vector_store_file_object.last_error = VectorStoreFileLastError(
-                    code="server_error",
-                    message="No chunks were generated from the file",
-                )
-                return vector_store_file_object
-
-            await self.insert_chunks(
-                vector_db_id=vector_store_id,
-                chunks=chunks,
-            )
-        except Exception as e:
-            vector_store_file_object.status = "failed"
-            vector_store_file_object.last_error = VectorStoreFileLastError(
-                code="server_error",
-                message=str(e),
-            )
-            return vector_store_file_object
-
-        vector_store_file_object.status = "completed"
-
-        return vector_store_file_object
diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
index 7d8163ed2..4849216a9 100644
--- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@@ -5,11 +5,13 @@
 # the root directory of this source tree.
 
 import logging
+import mimetypes
 import time
 import uuid
 from abc import ABC, abstractmethod
 from typing import Any
 
+from llama_stack.apis.files import Files
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
     QueryChunksResponse,
@@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
     VectorStoreSearchResponse,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import (
+    Chunk,
+    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyAuto,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreFileLastError,
+    VectorStoreFileObject,
+)
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
 
 logger = logging.getLogger(__name__)
 
@@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):
 
     # These should be provided by the implementing class
     openai_vector_stores: dict[str, dict[str, Any]]
+    files_api: Files
 
     @abstractmethod
     async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
         """Unregister a vector database (provider-specific implementation)."""
         pass
 
+    @abstractmethod
+    async def insert_chunks(
+        self,
+        vector_db_id: str,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
+    ) -> None:
+        """Insert chunks into a vector database (provider-specific implementation)."""
+        pass
+
     @abstractmethod
     async def query_chunks(
         self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@@ -383,3 +405,70 @@
             if metadata[key] != value:
                 return False
         return True
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        attributes = attributes or {}
+        chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
+
+        vector_store_file_object = VectorStoreFileObject(
+            id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+            created_at=int(time.time()),
+            status="in_progress",
+            vector_store_id=vector_store_id,
+        )
+
+        if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
+            max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
+            chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
+        else:
+            # Default values from OpenAI API spec
+            max_chunk_size_tokens = 800
+            chunk_overlap_tokens = 400
+
+        try:
+            file_response = await self.files_api.openai_retrieve_file(file_id)
+            mime_type, _ = mimetypes.guess_type(file_response.filename)
+            content_response = await self.files_api.openai_retrieve_file_content(file_id)
+
+            content = content_from_data_and_mime_type(content_response.body, mime_type)
+
+            chunks = make_overlapped_chunks(
+                file_id,
+                content,
+                max_chunk_size_tokens,
+                chunk_overlap_tokens,
+                attributes,
+            )
+
+            if not chunks:
+                vector_store_file_object.status = "failed"
+                vector_store_file_object.last_error = VectorStoreFileLastError(
+                    code="server_error",
+                    message="No chunks were generated from the file",
+                )
+                return vector_store_file_object
+
+            await self.insert_chunks(
+                vector_db_id=vector_store_id,
+                chunks=chunks,
+            )
+        except Exception as e:
+            logger.error(f"Error attaching file to vector store: {e}")
+            vector_store_file_object.status = "failed"
+            vector_store_file_object.last_error = VectorStoreFileLastError(
+                code="server_error",
+                message=str(e),
+            )
+            return vector_store_file_object
+
+        vector_store_file_object.status = "completed"
+
+        return vector_store_file_object
diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index 4cd15860b..2c0c7c8e9 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -72,16 +72,18 @@ def content_from_data(data_url: str) -> str:
         data = unquote(data)
         encoding = parts["encoding"] or "utf-8"
         data = data.encode(encoding)
+    return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))
 
-    encoding = parts["encoding"]
-    if not encoding:
-        import chardet
 
-        detected = chardet.detect(data)
-        encoding = detected["encoding"]
+def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
+    if isinstance(data, bytes):
+        if not encoding:
+            import chardet
 
-    mime_type = parts["mimetype"]
-    mime_category = mime_type.split("/")[0]
+            detected = chardet.detect(data)
+            encoding = detected["encoding"]
+
+    mime_category = mime_type.split("/")[0] if mime_type else None
     if mime_category == "text":
         # For text-based files (including CSV, MD)
         return data.decode(encoding)
diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py
index 66eada4ba..5b166e465 100644
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@@ -327,7 +327,7 @@ def test_response_non_streaming_file_search(
         vector_store_id=vector_store.id,
         file_id=file_response.id,
     )
-    assert file_attach_response.status == "completed"
+    assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}"
     assert not file_attach_response.last_error
 
     # Update our tools with the right vector store id
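
A quick sketch of how the centralized helper behaves after this change.
This is illustrative only: the sample bytes below are made up, while the
function name and signature come from the vector_store.py hunk above.

    from llama_stack.providers.utils.memory.vector_store import (
        content_from_data_and_mime_type,
    )

    # Text-like content arrives as raw bytes from the Files API; with no
    # encoding supplied, the helper sniffs one via chardet before decoding.
    csv_bytes = b"name,role\nalice,admin\n"  # made-up sample data
    assert content_from_data_and_mime_type(csv_bytes, "text/csv") == "name,role\nalice,admin\n"

    # An explicit encoding skips the chardet detection path.
    latin1_bytes = "caf\xe9".encode("latin-1")
    assert content_from_data_and_mime_type(latin1_bytes, "text/plain", encoding="latin-1") == "café"

    # Per the commit message, PDF parsing is also centralized here, so the
    # mixin can pass "application/pdf" content through the same entry point:
    #   content_from_data_and_mime_type(pdf_bytes, "application/pdf")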