implement embedding generation in supported inference providers

2025-12-18 03:19:52 +00:00 · 2024-12-09 12:48:56 -08:00 · 2024-12-09 12:48:56 -08:00 · e167e9eb93
commit e167e9eb93
parent b896be2311
16 changed files with 383 additions and 29 deletions
--- a/llama_stack/providers/tests/inference/conftest.py
+++ b/llama_stack/providers/tests/inference/conftest.py
@@ -18,6 +18,12 @@ def pytest_addoption(parser):
        default=None,
        help="Specify the inference model to use for testing",
    )
+    parser.addoption(
+        "--embedding-model",
+        action="store",
+        default=None,
+        help="Specify the embedding model to use for testing",
+    )


 def pytest_configure(config):
@@ -78,3 +84,24 @@ def pytest_generate_tests(metafunc):
        ):
            fixtures = [stack.values[0]["inference"] for stack in filtered_stacks]
        metafunc.parametrize("inference_stack", fixtures, indirect=True)
+
+    if "embedding_model" in metafunc.fixturenames:
+        model = metafunc.config.getoption("--embedding-model")
+        if not model:
+            raise ValueError(
+                "No embedding model specified. Please provide a valid embedding model."
+            )
+        params = [pytest.param(model, id="")]
+
+        metafunc.parametrize("embedding_model", params, indirect=True)
+
+    if "embedding_stack" in metafunc.fixturenames:
+        fixtures = INFERENCE_FIXTURES
+        if filtered_stacks := get_provider_fixture_overrides(
+            metafunc.config,
+            {
+                "inference": INFERENCE_FIXTURES,
+            },
+        ):
+            fixtures = [stack.values[0]["inference"] for stack in filtered_stacks]
+        metafunc.parametrize("embedding_stack", fixtures, indirect=True)
--- a/llama_stack/providers/tests/inference/fixtures.py
+++ b/llama_stack/providers/tests/inference/fixtures.py
@@ -9,9 +9,9 @@ import os
 import pytest
 import pytest_asyncio

-from llama_stack.apis.models import ModelInput
-
+from llama_stack.apis.models import ModelInput, ModelType
 from llama_stack.distribution.datatypes import Api, Provider
+
 from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceInferenceConfig,
 )
@@ -37,6 +37,13 @@ def inference_model(request):
    return request.config.getoption("--inference-model", None)


+@pytest.fixture(scope="session")
+def embedding_model(request):
+    if hasattr(request, "param"):
+        return request.param
+    return request.config.getoption("--embedding-model", None)
+
+
 @pytest.fixture(scope="session")
 def inference_remote() -> ProviderFixture:
    return remote_stack_fixture()
@@ -85,7 +92,7 @@ def inference_ollama(inference_model) -> ProviderFixture:
    inference_model = (
        [inference_model] if isinstance(inference_model, str) else inference_model
    )
-    if "Llama3.1-8B-Instruct" in inference_model:
+    if inference_model and "Llama3.1-8B-Instruct" in inference_model:
        pytest.skip("Ollama only supports Llama3.2-3B-Instruct for testing")

    return ProviderFixture(
@@ -240,3 +247,25 @@ async def inference_stack(request, inference_model):
    )

    return test_stack.impls[Api.inference], test_stack.impls[Api.models]
+
+
+@pytest_asyncio.fixture(scope="session")
+async def embedding_stack(request, embedding_model):
+    fixture_name = request.param
+    inference_fixture = request.getfixturevalue(f"inference_{fixture_name}")
+    test_stack = await construct_stack_for_test(
+        [Api.inference],
+        {"inference": inference_fixture.providers},
+        inference_fixture.provider_data,
+        models=[
+            ModelInput(
+                model_id=embedding_model,
+                model_type=ModelType.embedding_model,
+                metadata={
+                    "embedding_dimension": get_env_or_fail("EMBEDDING_DIMENSION"),
+                },
+            )
+        ],
+    )
+
+    return test_stack.impls[Api.inference], test_stack.impls[Api.models]
--- a/llama_stack/providers/tests/inference/test_embeddings.py
+++ b/llama_stack/providers/tests/inference/test_embeddings.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+
+from llama_stack.apis.inference import EmbeddingsResponse, ModelType
+
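+# How to run this test (needs --embedding-model and the EMBEDDING_DIMENSION env var):
+# EMBEDDING_DIMENSION=<dim> pytest -v -s llama_stack/providers/tests/inference/test_embeddings.py --embedding-model <model_id>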
+
+
+class TestEmbeddings:
+    @pytest.mark.asyncio
+    async def test_embeddings(self, embedding_model, embedding_stack):
+        inference_impl, models_impl = embedding_stack
+        model = await models_impl.get_model(embedding_model)
+
+        if model.model_type != ModelType.embedding_model:
+            pytest.skip("This test is only applicable for embedding models")
+
+        response = await inference_impl.embeddings(
+            model_id=embedding_model,
+            contents=["Hello, world!"],
+        )
+        assert isinstance(response, EmbeddingsResponse)
+        assert len(response.embeddings) > 0
+        assert all(isinstance(embedding, list) for embedding in response.embeddings)
+        assert all(
+            isinstance(value, float)
+            for embedding in response.embeddings
+            for value in embedding
+        )
+
+    @pytest.mark.asyncio
+    async def test_batch_embeddings(self, embedding_model, embedding_stack):
+        inference_impl, models_impl = embedding_stack
+        model = await models_impl.get_model(embedding_model)
+
+        if model.model_type != ModelType.embedding_model:
+            pytest.skip("This test is only applicable for embedding models")
+
+        texts = ["Hello, world!", "This is a test", "Testing embeddings"]
+
+        response = await inference_impl.embeddings(
+            model_id=embedding_model,
+            contents=texts,
+        )
+
+        assert isinstance(response, EmbeddingsResponse)
+        assert len(response.embeddings) == len(texts)
+        assert all(isinstance(embedding, list) for embedding in response.embeddings)
+        assert all(
+            isinstance(value, float)
+            for embedding in response.embeddings
+            for value in embedding
+        )
+
+        embedding_dim = len(response.embeddings[0])
+        assert all(len(embedding) == embedding_dim for embedding in response.embeddings)