Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-18 02:42:31 +00:00)
[memory refactor][2/n] Update faiss and make it pass tests (#830)
See https://github.com/meta-llama/llama-stack/issues/827 for the broader design. Second part:

- updates routing table / router code
- updates the faiss implementation

## Test Plan

```
pytest -s -v -k sentence test_vector_io.py --env EMBEDDING_DIMENSION=384
```
parent 3ae8585b65
commit 78a481bb22
19 changed files with 343 additions and 353 deletions
```diff
@@ -18,6 +18,8 @@ import numpy as np
 from llama_models.llama3.api.tokenizer import Tokenizer
 from numpy.typing import NDArray
 
+from pydantic import BaseModel, Field
+
 from pypdf import PdfReader
 
 from llama_stack.apis.common.content_types import (
```
```diff
@@ -25,16 +27,24 @@ from llama_stack.apis.common.content_types import (
     TextContentItem,
     URL,
 )
-from llama_stack.apis.memory import Chunk, MemoryBankDocument, QueryDocumentsResponse
-from llama_stack.apis.memory_banks import VectorMemoryBank
+from llama_stack.apis.vector_dbs import VectorDB
+from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
 from llama_stack.providers.datatypes import Api
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
 
 log = logging.getLogger(__name__)
 
 
+class MemoryBankDocument(BaseModel):
+    document_id: str
+    content: InterleavedContent | URL
+    mime_type: str | None = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
 def parse_pdf(data: bytes) -> str:
     # For PDF and DOC/DOCX files, we can't reliably convert to string
     pdf_bytes = io.BytesIO(data)
```
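With this change, `MemoryBankDocument` is defined locally in this utility module rather than imported from `llama_stack.apis.memory`. A minimal sketch of constructing one, assuming the `URL` content type takes a `uri` field (the concrete values here are made up for illustration):

```python
from llama_stack.apis.common.content_types import URL

# Hypothetical document, just to show the shape of the relocated model.
doc = MemoryBankDocument(
    document_id="doc-0",
    content=URL(uri="https://example.com/paper.pdf"),
    mime_type="application/pdf",
    metadata={"source": "example"},
)
```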
```diff
@@ -165,7 +175,7 @@ class EmbeddingIndex(ABC):
     @abstractmethod
     async def query(
         self, embedding: NDArray, k: int, score_threshold: float
-    ) -> QueryDocumentsResponse:
+    ) -> QueryChunksResponse:
         raise NotImplementedError()
 
     @abstractmethod
```
```diff
@@ -174,56 +184,35 @@ class EmbeddingIndex(ABC):
 
 
 @dataclass
-class BankWithIndex:
-    bank: VectorMemoryBank
+class VectorDBWithIndex:
+    vector_db: VectorDB
     index: EmbeddingIndex
     inference_api: Api.inference
 
-    async def insert_documents(
+    async def insert_chunks(
         self,
-        documents: List[MemoryBankDocument],
+        chunks: List[Chunk],
     ) -> None:
-        for doc in documents:
-            content = await content_from_doc(doc)
-            chunks = make_overlapped_chunks(
-                doc.document_id,
-                content,
-                self.bank.chunk_size_in_tokens,
-                self.bank.overlap_size_in_tokens
-                or (self.bank.chunk_size_in_tokens // 4),
-            )
-            if not chunks:
-                continue
-            embeddings_response = await self.inference_api.embeddings(
-                self.bank.embedding_model, [x.content for x in chunks]
-            )
-            embeddings = np.array(embeddings_response.embeddings)
-
-            await self.index.add_chunks(chunks, embeddings)
+        embeddings_response = await self.inference_api.embeddings(
+            self.vector_db.embedding_model, [x.content for x in chunks]
+        )
+        embeddings = np.array(embeddings_response.embeddings)
+
+        await self.index.add_chunks(chunks, embeddings)
 
-    async def query_documents(
+    async def query_chunks(
         self,
         query: InterleavedContent,
         params: Optional[Dict[str, Any]] = None,
-    ) -> QueryDocumentsResponse:
+    ) -> QueryChunksResponse:
         if params is None:
             params = {}
         k = params.get("max_chunks", 3)
         score_threshold = params.get("score_threshold", 0.0)
 
-        def _process(c) -> str:
-            if isinstance(c, str):
-                return c
-            else:
-                return "<media>"
-
-        if isinstance(query, list):
-            query_str = " ".join([_process(c) for c in query])
-        else:
-            query_str = _process(query)
-
+        query_str = interleaved_content_as_str(query)
         embeddings_response = await self.inference_api.embeddings(
-            self.bank.embedding_model, [query_str]
+            self.vector_db.embedding_model, [query_str]
         )
         query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
         return await self.index.query(query_vector, k, score_threshold)
```
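Putting the renamed pieces together: callers now chunk content themselves and drive `VectorDBWithIndex` directly. A minimal usage sketch, assuming a concrete `EmbeddingIndex` implementation and an inference provider are already wired up, and that `Chunk` accepts `content` and `metadata` fields (the `vector_db`, `faiss_index`, and `inference_api` objects below are hypothetical stand-ins):

```python
from llama_stack.apis.vector_io import Chunk


async def demo(vector_db, faiss_index, inference_api) -> None:
    # Mirror the dataclass fields from the diff above.
    store = VectorDBWithIndex(
        vector_db=vector_db,
        index=faiss_index,
        inference_api=inference_api,
    )

    # insert_chunks no longer fetches or chunks documents;
    # callers pass pre-chunked content.
    await store.insert_chunks(
        [Chunk(content="faiss is a library for vector search", metadata={})]
    )

    # query_chunks keeps the old knobs: max_chunks and score_threshold.
    response = await store.query_chunks(
        "what is faiss?",
        params={"max_chunks": 3, "score_threshold": 0.0},
    )
    print(response)
```

The document fetching and overlapped chunking that `insert_documents` used to perform (via `content_from_doc` and `make_overlapped_chunks`) now happen upstream, which is what lets the faiss provider operate purely on `Chunk` objects.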