Make embedding generation go through inference (#606)

This PR does the following: 1) Adds the ability to generate embeddings in all supported inference providers. 2) Moves all the memory providers to use the inference API and improves the memory tests to set up the inference stack correctly and use the embedding models. This is a merge from #589 and #598.
2025-12-04 10:10:36 +00:00 · 2024-12-12 11:47:50 -08:00 · 2024-12-12 11:47:50 -08:00 · 96e158eaac
commit 96e158eaac
parent a14785af46
37 changed files with 677 additions and 156 deletions
--- a/llama_stack/providers/tests/memory/fixtures.py
+++ b/llama_stack/providers/tests/memory/fixtures.py
@ -10,6 +10,8 @@ import tempfile
 import pytest
 import pytest_asyncio

+from llama_stack.apis.inference import ModelInput, ModelType
+
 from llama_stack.distribution.datatypes import Api, Provider
 from llama_stack.providers.inline.memory.chroma import ChromaInlineImplConfig
 from llama_stack.providers.inline.memory.faiss import FaissImplConfig
@ -105,14 +107,30 @@ MEMORY_FIXTURES = ["faiss", "pgvector", "weaviate", "remote", "chroma"]


@pytest_asyncio.fixture(scope="session")
-async def memory_stack(request):
-    fixture_name = request.param
-    fixture = request.getfixturevalue(f"memory_{fixture_name}")
+async def memory_stack(inference_model, request):
+    fixture_dict = request.param
+
+    providers = {}
+    provider_data = {}
+    for key in ["inference", "memory"]:
+        fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}")
+        providers[key] = fixture.providers
+        if fixture.provider_data:
+            provider_data.update(fixture.provider_data)

    test_stack = await construct_stack_for_test(
-        [Api.memory],
-        {"memory": fixture.providers},
-        fixture.provider_data,
+        [Api.memory, Api.inference],
+        providers,
+        provider_data,
+        models=[
+            ModelInput(
+                model_id=inference_model,
+                model_type=ModelType.embedding_model,
+                metadata={
+                    "embedding_dimension": get_env_or_fail("EMBEDDING_DIMENSION"),
+                },
+            )
+        ],
    )

    return test_stack.impls[Api.memory], test_stack.impls[Api.memory_banks]