implement embedding generation in supported inference providers (#589)

This PR adds the ability to generate embeddings in all supported
inference providers.

```
pytest -v -s llama_stack/providers/tests/inference/test_embeddings.py -k "bedrock" --inference-model="amazon.titan-embed-text-v2:0"  --env EMBEDDING_DIMENSION=1024

 pytest -v -s -k "vllm"  --inferrence-model="intfloat/e5-mistral-7b-instruct"  llama_stack/providers/tests/inference/test_embeddings.py --env EMBEDDING_DIMENSION=4096  --env VLLM_URL="http://localhost:9798/v1"

pytest -v -s --inference-model="nomic-ai/nomic-embed-text-v1.5"  llama_stack/providers/tests/inference/test_embeddings.py  -k "fireworks"  --env FIREWORKS_API_KEY=<API_KEY>--env EMBEDDING_DIMENSION=128

pytest -v -s --inference-model="togethercomputer/m2-bert-80M-2k-retrieval"  llama_stack/providers/tests/inference/test_embeddings.py  -k "together"  --env TOGETHER_API_KEY=<API_KEY>--env EMBEDDING_DIMENSION=768

pytest -v -s -k "ollama"  --inference-model="all-minilm:v8"  llama_stack/providers/tests/inference/test_embeddings.py --env EMBEDDING_DIMENSION=384

 torchrun $CONDA_PREFIX/bin/pytest -v -s -k "meta_reference" --inference-model="sentence-transformers/all-MiniLM-L6-v2"  llama_stack/providers/tests/inference/test_embeddings.py --env EMBEDDING_DIMENSION=384

```
This commit is contained in:
Dinesh Yeduguru 2024-12-12 11:17:39 -08:00
parent 6a23f24ee0
commit d362d2d740
32 changed files with 597 additions and 143 deletions

View file

@ -12,10 +12,11 @@ import weaviate
import weaviate.classes as wvc
from numpy.typing import NDArray
from weaviate.classes.init import Auth
from weaviate.classes.query import Filter
from llama_stack.apis.memory import * # noqa: F403
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.providers.datatypes import MemoryBanksProtocolPrivate
from llama_stack.providers.datatypes import Api, MemoryBanksProtocolPrivate
from llama_stack.providers.utils.memory.vector_store import (
BankWithIndex,
EmbeddingIndex,
@ -80,12 +81,21 @@ class WeaviateIndex(EmbeddingIndex):
return QueryDocumentsResponse(chunks=chunks, scores=scores)
async def delete(self, chunk_ids: List[str]) -> None:
collection = self.client.collections.get(self.collection_name)
collection.data.delete_many(
where=Filter.by_property("id").contains_any(chunk_ids)
)
class WeaviateMemoryAdapter(
Memory, NeedsRequestProviderData, MemoryBanksProtocolPrivate
Memory,
NeedsRequestProviderData,
MemoryBanksProtocolPrivate,
):
def __init__(self, config: WeaviateConfig) -> None:
def __init__(self, config: WeaviateConfig, inference_api: Api.inference) -> None:
self.config = config
self.inference_api = inference_api
self.client_cache = {}
self.cache = {}
@ -117,7 +127,7 @@ class WeaviateMemoryAdapter(
memory_bank: MemoryBank,
) -> None:
assert (
memory_bank.memory_bank_type == MemoryBankType.vector
memory_bank.memory_bank_type == MemoryBankType.vector.value
), f"Only vector banks are supported {memory_bank.memory_bank_type}"
client = self._get_client()
@ -135,11 +145,11 @@ class WeaviateMemoryAdapter(
],
)
index = BankWithIndex(
bank=memory_bank,
index=WeaviateIndex(client=client, collection_name=memory_bank.identifier),
self.cache[memory_bank.identifier] = BankWithIndex(
memory_bank,
WeaviateIndex(client=client, collection_name=memory_bank.identifier),
self.inference_api,
)
self.cache[memory_bank.identifier] = index
async def _get_and_cache_bank_index(self, bank_id: str) -> Optional[BankWithIndex]:
if bank_id in self.cache:
@ -156,6 +166,7 @@ class WeaviateMemoryAdapter(
index = BankWithIndex(
bank=bank,
index=WeaviateIndex(client=client, collection_name=bank_id),
inference_api=self.inference_api,
)
self.cache[bank_id] = index
return index