Make embedding generation go through inference (#606)

This PR does the following:
1) adds the ability to generate embeddings in all supported inference
providers.
2) Moves all the memory providers to use the inference API and improved
the memory tests to setup the inference stack correctly and use the
embedding models

This is a merge from #589 and #598
This commit is contained in:
Dinesh Yeduguru 2024-12-12 11:47:50 -08:00 committed by GitHub
parent a14785af46
commit 96e158eaac
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 677 additions and 156 deletions

View file

@ -45,12 +45,14 @@ def sample_documents():
]
async def register_memory_bank(banks_impl: MemoryBanks) -> MemoryBank:
async def register_memory_bank(
banks_impl: MemoryBanks, inference_model: str
) -> MemoryBank:
bank_id = f"test_bank_{uuid.uuid4().hex}"
return await banks_impl.register_memory_bank(
memory_bank_id=bank_id,
params=VectorMemoryBankParams(
embedding_model="all-MiniLM-L6-v2",
embedding_model=inference_model,
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
),
@ -59,11 +61,11 @@ async def register_memory_bank(banks_impl: MemoryBanks) -> MemoryBank:
class TestMemory:
@pytest.mark.asyncio
async def test_banks_list(self, memory_stack):
async def test_banks_list(self, memory_stack, inference_model):
_, banks_impl = memory_stack
# Register a test bank
registered_bank = await register_memory_bank(banks_impl)
registered_bank = await register_memory_bank(banks_impl, inference_model)
try:
# Verify our bank shows up in list
@ -84,7 +86,7 @@ class TestMemory:
)
@pytest.mark.asyncio
async def test_banks_register(self, memory_stack):
async def test_banks_register(self, memory_stack, inference_model):
_, banks_impl = memory_stack
bank_id = f"test_bank_{uuid.uuid4().hex}"
@ -94,7 +96,7 @@ class TestMemory:
await banks_impl.register_memory_bank(
memory_bank_id=bank_id,
params=VectorMemoryBankParams(
embedding_model="all-MiniLM-L6-v2",
embedding_model=inference_model,
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
),
@ -109,7 +111,7 @@ class TestMemory:
await banks_impl.register_memory_bank(
memory_bank_id=bank_id,
params=VectorMemoryBankParams(
embedding_model="all-MiniLM-L6-v2",
embedding_model=inference_model,
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
),
@ -126,13 +128,15 @@ class TestMemory:
await banks_impl.unregister_memory_bank(bank_id)
@pytest.mark.asyncio
async def test_query_documents(self, memory_stack, sample_documents):
async def test_query_documents(
self, memory_stack, inference_model, sample_documents
):
memory_impl, banks_impl = memory_stack
with pytest.raises(ValueError):
await memory_impl.insert_documents("test_bank", sample_documents)
registered_bank = await register_memory_bank(banks_impl)
registered_bank = await register_memory_bank(banks_impl, inference_model)
await memory_impl.insert_documents(
registered_bank.memory_bank_id, sample_documents
)
@ -165,13 +169,13 @@ class TestMemory:
# Test case 5: Query with threshold on similarity score
query5 = "quantum computing" # Not directly related to any document
params5 = {"score_threshold": 0.2}
params5 = {"score_threshold": 0.01}
response5 = await memory_impl.query_documents(
registered_bank.memory_bank_id, query5, params5
)
assert_valid_response(response5)
print("The scores are:", response5.scores)
assert all(score >= 0.2 for score in response5.scores)
assert all(score >= 0.01 for score in response5.scores)
def assert_valid_response(response: QueryDocumentsResponse):