feat: implement chunk deletion for vector stores

Add support for deleting individual chunks from vector stores

- Add abstract delete_chunk() method to EmbeddingIndex base class
- Implement chunk deletion for Faiss provider with index tracking
- Add chunk_ids list to maintain chunk order in Faiss index
- Integrate chunk deletion into OpenAI vector store file deletion flow
- Add placeholder implementations for SQLite and Milvus providers

Closes: #2477

Signed-off-by: Derek Higgins <derekh@redhat.com>
This commit is contained in:
Derek Higgins 2025-07-09 12:19:50 +01:00
parent cd8715d327
commit 3d4b32db0a
5 changed files with 57 additions and 1 deletions

View file

@ -55,6 +55,11 @@ class FaissIndex(EmbeddingIndex):
self.kvstore = kvstore
self.bank_id = bank_id
# A list of chunk id's in the same order as they are in the index,
# must be updated when chunks are added or removed
self.chunk_id_lock = asyncio.Lock()
self.chunk_ids: list[Any] = []
@classmethod
async def create(cls, dimension: int, kvstore: KVStore | None = None, bank_id: str | None = None):
instance = cls(dimension, kvstore, bank_id)
@ -75,6 +80,7 @@ class FaissIndex(EmbeddingIndex):
buffer = io.BytesIO(base64.b64decode(data["faiss_index"]))
try:
self.index = faiss.deserialize_index(np.load(buffer, allow_pickle=False))
self.chunk_ids = [chunk.chunk_id for chunk in self.chunk_by_index.values()]
except Exception as e:
logger.debug(e, exc_info=True)
raise ValueError(
@ -114,11 +120,33 @@ class FaissIndex(EmbeddingIndex):
for i, chunk in enumerate(chunks):
self.chunk_by_index[indexlen + i] = chunk
self.index.add(np.array(embeddings).astype(np.float32))
async with self.chunk_id_lock:
self.index.add(np.array(embeddings).astype(np.float32))
self.chunk_ids.extend([chunk.chunk_id for chunk in chunks])
# Save updated index
await self._save_index()
async def delete_chunk(self, chunk_id: str) -> None:
if chunk_id not in self.chunk_ids:
return
async with self.chunk_id_lock:
index = self.chunk_ids.index(chunk_id)
self.index.remove_ids(np.array([index]))
new_chunk_by_index = {}
for idx, chunk in self.chunk_by_index.items():
# Shift all chunks after the removed chunk to the left
if idx > index:
new_chunk_by_index[idx - 1] = chunk
else:
new_chunk_by_index[idx] = chunk
self.chunk_by_index = new_chunk_by_index
self.chunk_ids.pop(index)
await self._save_index()
async def query_vector(
self,
embedding: NDArray,
@ -260,3 +288,8 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
raise ValueError(f"Vector DB {vector_db_id} not found")
return await index.query_chunks(query, params)
async def _delete_openai_chunk_from_vector_store(self, store_id: str, chunk_id: str) -> None:
"""Delete a chunk from a faiss index"""
faiss_index = self.cache[store_id].index
await faiss_index.delete_chunk(chunk_id)

View file

@ -520,3 +520,7 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
if not index:
raise ValueError(f"Vector DB {vector_db_id} not found")
return await index.query_chunks(query, params)
async def _delete_openai_chunk_from_vector_store(self, store_id: str, chunk_id: str) -> None:
"""Delete a chunk from a sqlite_vec index."""
pass # TODO