feat: implement chunk deletion for vector stores (#2701)

Add support for deleting individual chunks from vector stores

- Add abstract remove_chunk() method to EmbeddingIndex base class
- Implement chunk deletion for Faiss provider, SQLite Vec, Milvus,
PGVector
- Placeholder implementations with NotImplementedError for
Chroma/Qdrant/Weaviate
- Integrate chunk deletion into OpenAI vector store file deletion flow
- removed xfail from
test_openai_vector_store_delete_file_removes_from_vector_store

Closes: #2477

---------

Signed-off-by: Derek Higgins <derekh@redhat.com>
Co-authored-by: Francisco Arceo <arceofrancisco@gmail.com>
This commit is contained in:
Derek Higgins 2025-07-25 15:30:30 +01:00 committed by GitHub
parent 9e77be1f72
commit 52201612de
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 148 additions and 12 deletions

View file

@ -152,6 +152,11 @@ class OpenAIVectorStoreMixin(ABC):
"""Load existing OpenAI vector stores into the in-memory cache."""
self.openai_vector_stores = await self._load_openai_vector_stores()
@abstractmethod
async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
"""Delete a chunk from a vector store."""
pass
@abstractmethod
async def register_vector_db(self, vector_db: VectorDB) -> None:
"""Register a vector database (provider-specific implementation)."""
@ -763,17 +768,15 @@ class OpenAIVectorStoreMixin(ABC):
if vector_store_id not in self.openai_vector_stores:
raise ValueError(f"Vector store {vector_store_id} not found")
dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id)
chunks = [Chunk.model_validate(c) for c in dict_chunks]
await self.delete_chunks(vector_store_id, [str(c.chunk_id) for c in chunks if c.chunk_id])
store_info = self.openai_vector_stores[vector_store_id].copy()
file = await self.openai_retrieve_vector_store_file(vector_store_id, file_id)
await self._delete_openai_vector_store_file_from_storage(vector_store_id, file_id)
# TODO: We need to actually delete the embeddings from the underlying vector store...
# Also uncomment the corresponding integration test marked as xfail
#
# test_openai_vector_store_delete_file_removes_from_vector_store in
# tests/integration/vector_io/test_openai_vector_stores.py
# Update in-memory cache
store_info["file_ids"].remove(file_id)
store_info["file_counts"][file.status] -= 1

View file

@ -231,6 +231,10 @@ class EmbeddingIndex(ABC):
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
raise NotImplementedError()
@abstractmethod
async def delete_chunk(self, chunk_id: str):
raise NotImplementedError()
@abstractmethod
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError()