Make embedding generation go through inference (#606)

This PR does the following: 1) Adds the ability to generate embeddings in all supported inference providers. 2) Moves all the memory providers to use the inference API and improves the memory tests to set up the inference stack correctly and use the embedding models. This is a merge of #589 and #598.
2024-12-12 11:47:50 -08:00 · 2024-12-12 11:47:50 -08:00 · 96e158eaac
commit 96e158eaac
parent a14785af46
37 changed files with 677 additions and 156 deletions
--- a/llama_stack/providers/tests/memory/test_memory.py
+++ b/llama_stack/providers/tests/memory/test_memory.py
@ -45,12 +45,14 @@ def sample_documents():
    ]


-async def register_memory_bank(banks_impl: MemoryBanks) -> MemoryBank:
+async def register_memory_bank(
+    banks_impl: MemoryBanks, inference_model: str
+) -> MemoryBank:
    bank_id = f"test_bank_{uuid.uuid4().hex}"
    return await banks_impl.register_memory_bank(
        memory_bank_id=bank_id,
        params=VectorMemoryBankParams(
-            embedding_model="all-MiniLM-L6-v2",
+            embedding_model=inference_model,
            chunk_size_in_tokens=512,
            overlap_size_in_tokens=64,
        ),
@ -59,11 +61,11 @@ async def register_memory_bank(banks_impl: MemoryBanks) -> MemoryBank:

 class TestMemory:
    @pytest.mark.asyncio
-    async def test_banks_list(self, memory_stack):
+    async def test_banks_list(self, memory_stack, inference_model):
        _, banks_impl = memory_stack

        # Register a test bank
-        registered_bank = await register_memory_bank(banks_impl)
+        registered_bank = await register_memory_bank(banks_impl, inference_model)

        try:
            # Verify our bank shows up in list
@ -84,7 +86,7 @@ class TestMemory:
        )

    @pytest.mark.asyncio
-    async def test_banks_register(self, memory_stack):
+    async def test_banks_register(self, memory_stack, inference_model):
        _, banks_impl = memory_stack

        bank_id = f"test_bank_{uuid.uuid4().hex}"
@ -94,7 +96,7 @@ class TestMemory:
            await banks_impl.register_memory_bank(
                memory_bank_id=bank_id,
                params=VectorMemoryBankParams(
-                    embedding_model="all-MiniLM-L6-v2",
+                    embedding_model=inference_model,
                    chunk_size_in_tokens=512,
                    overlap_size_in_tokens=64,
                ),
@ -109,7 +111,7 @@ class TestMemory:
            await banks_impl.register_memory_bank(
                memory_bank_id=bank_id,
                params=VectorMemoryBankParams(
-                    embedding_model="all-MiniLM-L6-v2",
+                    embedding_model=inference_model,
                    chunk_size_in_tokens=512,
                    overlap_size_in_tokens=64,
                ),
@ -126,13 +128,15 @@ class TestMemory:
            await banks_impl.unregister_memory_bank(bank_id)

    @pytest.mark.asyncio
-    async def test_query_documents(self, memory_stack, sample_documents):
+    async def test_query_documents(
+        self, memory_stack, inference_model, sample_documents
+    ):
        memory_impl, banks_impl = memory_stack

        with pytest.raises(ValueError):
            await memory_impl.insert_documents("test_bank", sample_documents)

-        registered_bank = await register_memory_bank(banks_impl)
+        registered_bank = await register_memory_bank(banks_impl, inference_model)
        await memory_impl.insert_documents(
            registered_bank.memory_bank_id, sample_documents
        )
@ -165,13 +169,13 @@ class TestMemory:

        # Test case 5: Query with threshold on similarity score
        query5 = "quantum computing"  # Not directly related to any document
-        params5 = {"score_threshold": 0.2}
+        params5 = {"score_threshold": 0.01}
        response5 = await memory_impl.query_documents(
            registered_bank.memory_bank_id, query5, params5
        )
        assert_valid_response(response5)
        print("The scores are:", response5.scores)
-        assert all(score >= 0.2 for score in response5.scores)
+        assert all(score >= 0.01 for score in response5.scores)


 def assert_valid_response(response: QueryDocumentsResponse):