fix(vector-io): unify score calculation to use cosine and normalize to [0,1]

Commit a0e0c7030b by ChristianZaccaria, 2025-09-04 13:03:59 +01:00 (parent commit 9618adba89).
9 changed files with 166 additions and 42 deletions.

View file

@@ -52,7 +52,7 @@ OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX = f"openai_vector_stores_files_conten
class FaissIndex(EmbeddingIndex):
def __init__(self, dimension: int, kvstore: KVStore | None = None, bank_id: str | None = None):
self.index = faiss.IndexFlatL2(dimension)
self.index = faiss.IndexFlatIP(dimension)
self.chunk_by_index: dict[int, Chunk] = {}
self.kvstore = kvstore
self.bank_id = bank_id
@@ -122,8 +122,12 @@ class FaissIndex(EmbeddingIndex):
for i, chunk in enumerate(chunks):
self.chunk_by_index[indexlen + i] = chunk
# Normalize embeddings for cosine similarity
normalized_embeddings = np.array(embeddings).astype(np.float32)
faiss.normalize_L2(normalized_embeddings)
async with self.chunk_id_lock:
self.index.add(np.array(embeddings).astype(np.float32))
self.index.add(normalized_embeddings)
self.chunk_ids.extend([chunk.chunk_id for chunk in chunks])
# Save updated index
@@ -160,18 +164,28 @@ class FaissIndex(EmbeddingIndex):
k: int,
score_threshold: float,
) -> QueryChunksResponse:
distances, indices = await asyncio.to_thread(self.index.search, embedding.reshape(1, -1).astype(np.float32), k)
logger.info(
f"FAISS VECTOR SEARCH CALLED: embedding_shape={embedding.shape}, k={k}, threshold={score_threshold}"
)
# Normalize query embedding for cosine similarity
query_embedding = embedding.reshape(1, -1).astype(np.float32)
faiss.normalize_L2(query_embedding)
distances, indices = await asyncio.to_thread(self.index.search, query_embedding, k)
chunks = []
scores = []
for d, i in zip(distances[0], indices[0], strict=False):
if i < 0:
continue
score = 1.0 / float(d) if d != 0 else float("inf")
# For IndexFlatIP with normalized vectors, d is cosine similarity in [-1,1]
score = (float(d) + 1.0) / 2.0 # rescale to [0,1]
logger.info(f"Computed score {score} from distance {d} for chunk id {self.chunk_ids[int(i)]}")
if score < score_threshold:
continue
chunks.append(self.chunk_by_index[int(i)])
scores.append(score)
logger.info(f"FAISS VECTOR SEARCH RESULTS: Found {len(chunks)} chunks with scores {scores}")
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_keyword(
@@ -241,7 +255,7 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
"""
try:
vector_dimension = 128 # sample dimension
faiss.IndexFlatL2(vector_dimension)
faiss.IndexFlatIP(vector_dimension)
return HealthResponse(status=HealthStatus.OK)
except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")