[memory refactor][2/n] Update faiss and make it pass tests (#830)

See https://github.com/meta-llama/llama-stack/issues/827 for the broader
design.

Second part:

- updates the routing table / router code
- updates the faiss implementation to the new chunk-based interface (a brief sketch follows below)
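
For orientation, here is a minimal sketch of the chunk-based surface this refactor moves to. The `insert_chunks` / `query_chunks` names and the `Chunk` shape are taken from the diff below; the in-memory index is an illustrative stand-in, not the faiss provider:

```
# Illustrative stand-in: shows insert_chunks/query_chunks replacing the old
# insert_documents/query_documents surface. Not the actual provider code.
import asyncio
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class Chunk:
    content: str
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class InMemoryVectorDB:
    chunks: List[Chunk] = field(default_factory=list)

    async def insert_chunks(self, chunks: List[Chunk]) -> None:
        # Callers now pre-chunk their documents; no document objects cross the API.
        self.chunks.extend(chunks)

    async def query_chunks(
        self, query: str, params: Optional[Dict[str, Any]] = None
    ) -> List[Chunk]:
        k = (params or {}).get("max_chunks", 3)
        return [c for c in self.chunks if query in c.content][:k]


async def main() -> None:
    db = InMemoryVectorDB()
    await db.insert_chunks([Chunk(content="faiss indexes embedding vectors")])
    print(await db.query_chunks("faiss"))


asyncio.run(main())
```

The notable behavioral change visible in the diff: document parsing and chunking move out of the provider, so `insert_chunks` receives pre-chunked content directly.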


## Test Plan

```
pytest -s -v -k sentence test_vector_io.py --env EMBEDDING_DIMENSION=384
```
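
The `-k sentence` filter selects the sentence-transformers-backed cases; `EMBEDDING_DIMENSION=384` matches the output width of small sentence-transformers models such as `all-MiniLM-L6-v2` (the specific model is an assumption here, not pinned by this PR). A quick way to verify that dimension:

```
# Sanity check for the 384-dim figure; requires `pip install sentence-transformers`.
# all-MiniLM-L6-v2 is an example model, not necessarily the one the tests pick.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
print(model.encode(["hello world"]).shape)  # expected: (1, 384)
```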
Ashwin Bharambe authored on 2025-01-22 10:02:15 -08:00, committed by GitHub
parent 3ae8585b65
commit 78a481bb22
19 changed files with 343 additions and 353 deletions


@@ -18,6 +18,8 @@ import numpy as np
 from llama_models.llama3.api.tokenizer import Tokenizer
 from numpy.typing import NDArray
+from pydantic import BaseModel, Field
 from pypdf import PdfReader

 from llama_stack.apis.common.content_types import (
@@ -25,16 +27,24 @@ from llama_stack.apis.common.content_types import (
     TextContentItem,
     URL,
 )
-from llama_stack.apis.memory import Chunk, MemoryBankDocument, QueryDocumentsResponse
-from llama_stack.apis.memory_banks import VectorMemoryBank
+from llama_stack.apis.vector_dbs import VectorDB
+from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
 from llama_stack.providers.datatypes import Api
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,
 )

 log = logging.getLogger(__name__)


+class MemoryBankDocument(BaseModel):
+    document_id: str
+    content: InterleavedContent | URL
+    mime_type: str | None = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
 def parse_pdf(data: bytes) -> str:
     # For PDF and DOC/DOCX files, we can't reliably convert to string
     pdf_bytes = io.BytesIO(data)
@@ -165,7 +175,7 @@ class EmbeddingIndex(ABC):
     @abstractmethod
     async def query(
         self, embedding: NDArray, k: int, score_threshold: float
-    ) -> QueryDocumentsResponse:
+    ) -> QueryChunksResponse:
         raise NotImplementedError()

     @abstractmethod
@@ -174,56 +184,35 @@ class EmbeddingIndex(ABC):
 @dataclass
-class BankWithIndex:
-    bank: VectorMemoryBank
+class VectorDBWithIndex:
+    vector_db: VectorDB
     index: EmbeddingIndex
     inference_api: Api.inference

-    async def insert_documents(
+    async def insert_chunks(
         self,
-        documents: List[MemoryBankDocument],
+        chunks: List[Chunk],
     ) -> None:
-        for doc in documents:
-            content = await content_from_doc(doc)
-            chunks = make_overlapped_chunks(
-                doc.document_id,
-                content,
-                self.bank.chunk_size_in_tokens,
-                self.bank.overlap_size_in_tokens
-                or (self.bank.chunk_size_in_tokens // 4),
-            )
-            if not chunks:
-                continue
-            embeddings_response = await self.inference_api.embeddings(
-                self.bank.embedding_model, [x.content for x in chunks]
-            )
-            embeddings = np.array(embeddings_response.embeddings)
+        embeddings_response = await self.inference_api.embeddings(
+            self.vector_db.embedding_model, [x.content for x in chunks]
+        )
+        embeddings = np.array(embeddings_response.embeddings)

-            await self.index.add_chunks(chunks, embeddings)
+        await self.index.add_chunks(chunks, embeddings)

-    async def query_documents(
+    async def query_chunks(
         self,
         query: InterleavedContent,
         params: Optional[Dict[str, Any]] = None,
-    ) -> QueryDocumentsResponse:
+    ) -> QueryChunksResponse:
         if params is None:
             params = {}
         k = params.get("max_chunks", 3)
         score_threshold = params.get("score_threshold", 0.0)

-        def _process(c) -> str:
-            if isinstance(c, str):
-                return c
-            else:
-                return "<media>"
-
-        if isinstance(query, list):
-            query_str = " ".join([_process(c) for c in query])
-        else:
-            query_str = _process(query)
-
+        query_str = interleaved_content_as_str(query)
         embeddings_response = await self.inference_api.embeddings(
-            self.bank.embedding_model, [query_str]
+            self.vector_db.embedding_model, [query_str]
         )
         query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
         return await self.index.query(query_vector, k, score_threshold)
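
To make the `EmbeddingIndex` contract above concrete, here is a minimal faiss-backed sketch. This is not the provider implementation from this PR; the `QueryChunksResponse(chunks=..., scores=...)` field names and the inverse-distance scoring are assumptions:

```
# Minimal faiss-backed EmbeddingIndex sketch (illustrative, not the PR's provider).
import faiss  # pip install faiss-cpu
import numpy as np


class FaissIndex(EmbeddingIndex):
    def __init__(self, dimension: int):
        self.index = faiss.IndexFlatL2(dimension)
        self.chunk_by_index: dict[int, Chunk] = {}

    async def add_chunks(self, chunks: list[Chunk], embeddings: np.ndarray) -> None:
        start = len(self.chunk_by_index)
        for i, chunk in enumerate(chunks):
            self.chunk_by_index[start + i] = chunk
        # faiss expects a contiguous float32 matrix of shape (n, dimension)
        self.index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

    async def query(
        self, embedding: np.ndarray, k: int, score_threshold: float
    ) -> QueryChunksResponse:
        distances, indices = self.index.search(
            embedding.reshape(1, -1).astype(np.float32), k
        )
        chunks, scores = [], []
        for dist, idx in zip(distances[0], indices[0]):
            if idx < 0:  # faiss pads missing results with -1
                continue
            score = 1.0 / float(dist + 1e-6)  # assumed inverse-distance scoring
            if score < score_threshold:
                continue
            chunks.append(self.chunk_by_index[int(idx)])
            scores.append(score)
        return QueryChunksResponse(chunks=chunks, scores=scores)
```

Paired with `VectorDBWithIndex` above, `insert_chunks` computes embeddings via the inference API and hands them to `add_chunks`, while `query_chunks` embeds the query string and delegates to `query`.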