mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-18 17:09:48 +00:00
user inference api to generate embeddings in vector store
This commit is contained in:
parent
96accc1216
commit
5bbeb985ca
12 changed files with 134 additions and 96 deletions
|
|
@ -22,28 +22,10 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
|
|||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
|
||||
from llama_stack.apis.memory import * # noqa: F403
|
||||
from llama_stack.providers.datatypes import Api
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
ALL_MINILM_L6_V2_DIMENSION = 384
|
||||
|
||||
EMBEDDING_MODELS = {}
|
||||
|
||||
|
||||
def get_embedding_model(model: str) -> "SentenceTransformer":
|
||||
global EMBEDDING_MODELS
|
||||
|
||||
loaded_model = EMBEDDING_MODELS.get(model)
|
||||
if loaded_model is not None:
|
||||
return loaded_model
|
||||
|
||||
log.info(f"Loading sentence transformer for {model}...")
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
loaded_model = SentenceTransformer(model)
|
||||
EMBEDDING_MODELS[model] = loaded_model
|
||||
return loaded_model
|
||||
|
||||
|
||||
def parse_pdf(data: bytes) -> str:
|
||||
# For PDF and DOC/DOCX files, we can't reliably convert to string
|
||||
|
|
@ -166,12 +148,12 @@ class EmbeddingIndex(ABC):
|
|||
class BankWithIndex:
|
||||
bank: VectorMemoryBank
|
||||
index: EmbeddingIndex
|
||||
inference_api: Api.inference
|
||||
|
||||
async def insert_documents(
|
||||
self,
|
||||
documents: List[MemoryBankDocument],
|
||||
) -> None:
|
||||
model = get_embedding_model(self.bank.embedding_model)
|
||||
for doc in documents:
|
||||
content = await content_from_doc(doc)
|
||||
chunks = make_overlapped_chunks(
|
||||
|
|
@ -183,7 +165,10 @@ class BankWithIndex:
|
|||
)
|
||||
if not chunks:
|
||||
continue
|
||||
embeddings = model.encode([x.content for x in chunks]).astype(np.float32)
|
||||
embeddings_response = await self.inference_api.embeddings(
|
||||
self.bank.embedding_model, [x.content for x in chunks]
|
||||
)
|
||||
embeddings = np.array(embeddings_response.embeddings)
|
||||
|
||||
await self.index.add_chunks(chunks, embeddings)
|
||||
|
||||
|
|
@ -208,6 +193,25 @@ class BankWithIndex:
|
|||
else:
|
||||
query_str = _process(query)
|
||||
|
||||
model = get_embedding_model(self.bank.embedding_model)
|
||||
query_vector = model.encode([query_str])[0].astype(np.float32)
|
||||
embeddings_response = await self.inference_api.embeddings(
|
||||
self.bank.embedding_model, [query_str]
|
||||
)
|
||||
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
|
||||
return await self.index.query(query_vector, k, score_threshold)
|
||||
|
||||
|
||||
class InferenceEmbeddingMixin:
|
||||
inference_api: Api.inference
|
||||
|
||||
def __init__(self, inference_api: Api.inference):
|
||||
self.inference_api = inference_api
|
||||
|
||||
def _create_bank_with_index(
|
||||
self, bank: VectorMemoryBank, index: EmbeddingIndex
|
||||
) -> BankWithIndex:
|
||||
|
||||
return BankWithIndex(
|
||||
bank=bank,
|
||||
index=index,
|
||||
inference_api=self.inference_api,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue