refactor(test): unify vector_io tests and make them configurable (#1398)

## Test Plan `LLAMA_STACK_CONFIG=inference=sentence-transformers,vector_io=sqlite-vec pytest -s -v test_vector_io.py --embedding-model all-miniLM-L6-V2 --inference-model='' --vision-inference-model=''` ``` test_vector_io.py::test_vector_db_retrieve[txt=:vis=:emb=all-miniLM-L6-V2] PASSED test_vector_io.py::test_vector_db_register[txt=:vis=:emb=all-miniLM-L6-V2] PASSED test_vector_io.py::test_insert_chunks[txt=:vis=:emb=all-miniLM-L6-V2-test_case0] PASSED test_vector_io.py::test_insert_chunks[txt=:vis=:emb=all-miniLM-L6-V2-test_case1] PASSED test_vector_io.py::test_insert_chunks[txt=:vis=:emb=all-miniLM-L6-V2-test_case2] PASSED test_vector_io.py::test_insert_chunks[txt=:vis=:emb=all-miniLM-L6-V2-test_case3] PASSED test_vector_io.py::test_insert_chunks[txt=:vis=:emb=all-miniLM-L6-V2-test_case4] PASSED ``` Same thing with: - LLAMA_STACK_CONFIG=inference=sentence-transformers,vector_io=faiss - LLAMA_STACK_CONFIG=fireworks (Note that ergonomics will soon be improved re: cmd-line options and env variables)
2025-12-03 09:53:45 +00:00 · 2025-03-04 13:37:45 -08:00 · 2025-03-04 13:37:45 -08:00 · dd0db8038b
commit dd0db8038b
parent fd8c991393
27 changed files with 117 additions and 559 deletions
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -128,6 +128,7 @@ def distro_from_adhoc_config_spec(adhoc_config_spec: str) -> str:
    api_providers = adhoc_config_spec.replace(";", ",").split(",")
    provider_registry = get_provider_registry()

+    distro_dir = tempfile.mkdtemp()
    provider_configs_by_api = {}
    for api_provider in api_providers:
        api_str, provider = api_provider.split("=")
@ -147,7 +148,7 @@ def distro_from_adhoc_config_spec(adhoc_config_spec: str) -> str:

        # call method "sample_run_config" on the provider spec config class
        provider_config_type = instantiate_class_type(provider_spec.config_class)
-        provider_config = replace_env_vars(provider_config_type.sample_run_config())
+        provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))

        provider_configs_by_api[api_str] = [
            Provider(
--- a/tests/integration/vector_io/test_vector_io.py
+++ b/tests/integration/vector_io/test_vector_io.py
@ -4,83 +4,119 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import random
-
 import pytest

-INLINE_VECTOR_DB_PROVIDERS = [
-    "faiss",
-    # TODO: add sqlite_vec to templates
-    # "sqlite_vec",
-]
+from llama_stack.apis.vector_io import Chunk
+
+
+@pytest.fixture(scope="session")
+def sample_chunks():
+    return [
+        Chunk(
+            content="Python is a high-level programming language that emphasizes code readability and allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.",
+            metadata={"document_id": "doc1"},
+        ),
+        Chunk(
+            content="Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed, using statistical techniques to give computer systems the ability to progressively improve performance on a specific task.",
+            metadata={"document_id": "doc2"},
+        ),
+        Chunk(
+            content="Data structures are fundamental to computer science because they provide organized ways to store and access data efficiently, enable faster processing of data through optimized algorithms, and form the building blocks for more complex software systems.",
+            metadata={"document_id": "doc3"},
+        ),
+        Chunk(
+            content="Neural networks are inspired by biological neural networks found in animal brains, using interconnected nodes called artificial neurons to process information through weighted connections that can be trained to recognize patterns and solve complex problems through iterative learning.",
+            metadata={"document_id": "doc4"},
+        ),
+    ]


@pytest.fixture(scope="function")
-def empty_vector_db_registry(llama_stack_client):
-    vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
-    for vector_db_id in vector_dbs:
-        llama_stack_client.vector_dbs.unregister(vector_db_id=vector_db_id)
+def client_with_empty_registry(client_with_models):
+    def clear_registry():
+        vector_dbs = [vector_db.identifier for vector_db in client_with_models.vector_dbs.list()]
+        for vector_db_id in vector_dbs:
+            client_with_models.vector_dbs.unregister(vector_db_id=vector_db_id)
+
+    clear_registry()
+    yield client_with_models
+
+    # you must clean after the last test if you were running tests against
+    # a stateful server instance
+    clear_registry()


-@pytest.fixture(scope="function")
-def single_entry_vector_db_registry(llama_stack_client, empty_vector_db_registry, provider_id):
-    vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}"
-    llama_stack_client.vector_dbs.register(
-        vector_db_id=vector_db_id,
-        embedding_model="all-MiniLM-L6-v2",
-        embedding_dimension=384,
-        provider_id=provider_id,
-    )
-    vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
-    return vector_dbs
-
-
-@pytest.mark.parametrize("provider_id", INLINE_VECTOR_DB_PROVIDERS)
-def test_vector_db_retrieve(llama_stack_client, embedding_model_id, empty_vector_db_registry, provider_id):
+def test_vector_db_retrieve(client_with_empty_registry, embedding_model_id):
    # Register a memory bank first
-    vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}"
-    llama_stack_client.vector_dbs.register(
+    vector_db_id = "test_vector_db"
+    client_with_empty_registry.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model=embedding_model_id,
        embedding_dimension=384,
-        provider_id=provider_id,
    )

    # Retrieve the memory bank and validate its properties
-    response = llama_stack_client.vector_dbs.retrieve(vector_db_id=vector_db_id)
+    response = client_with_empty_registry.vector_dbs.retrieve(vector_db_id=vector_db_id)
    assert response is not None
    assert response.identifier == vector_db_id
    assert response.embedding_model == embedding_model_id
-    assert response.provider_id == provider_id
    assert response.provider_resource_id == vector_db_id


-def test_vector_db_list(llama_stack_client, empty_vector_db_registry):
-    vector_dbs_after_register = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
-    assert len(vector_dbs_after_register) == 0
-
-
-@pytest.mark.parametrize("provider_id", INLINE_VECTOR_DB_PROVIDERS)
-def test_vector_db_register(llama_stack_client, embedding_model_id, empty_vector_db_registry, provider_id):
-    vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}"
-    llama_stack_client.vector_dbs.register(
+def test_vector_db_register(client_with_empty_registry, embedding_model_id):
+    vector_db_id = "test_vector_db"
+    client_with_empty_registry.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model=embedding_model_id,
        embedding_dimension=384,
-        provider_id=provider_id,
    )

-    vector_dbs_after_register = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
+    vector_dbs_after_register = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
    assert vector_dbs_after_register == [vector_db_id]

+    client_with_empty_registry.vector_dbs.unregister(vector_db_id=vector_db_id)

-@pytest.mark.parametrize("provider_id", INLINE_VECTOR_DB_PROVIDERS)
-def test_vector_db_unregister(llama_stack_client, single_entry_vector_db_registry, provider_id):
-    vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
-    assert len(vector_dbs) == 1
-
-    vector_db_id = vector_dbs[0]
-    llama_stack_client.vector_dbs.unregister(vector_db_id=vector_db_id)
-
-    vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()]
+    vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
    assert len(vector_dbs) == 0
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        ("What makes Python different from C++ and Java?", "doc1"),
+        ("How do systems learn without explicit programming?", "doc2"),
+        ("Why are data structures important in computer science?", "doc3"),
+        ("What is the biological inspiration for neural networks?", "doc4"),
+        ("How does machine learning improve over time?", "doc2"),
+    ],
+)
+def test_insert_chunks(client_with_empty_registry, embedding_model_id, sample_chunks, test_case):
+    vector_db_id = "test_vector_db"
+    client_with_empty_registry.vector_dbs.register(
+        vector_db_id=vector_db_id,
+        embedding_model=embedding_model_id,
+        embedding_dimension=384,
+    )
+
+    client_with_empty_registry.vector_io.insert(
+        vector_db_id=vector_db_id,
+        chunks=sample_chunks,
+    )
+
+    response = client_with_empty_registry.vector_io.query(
+        vector_db_id=vector_db_id,
+        query="What is the capital of France?",
+    )
+    assert response is not None
+    assert len(response.chunks) > 1
+    assert len(response.scores) > 1
+
+    query, expected_doc_id = test_case
+    response = client_with_empty_registry.vector_io.query(
+        vector_db_id=vector_db_id,
+        query=query,
+    )
+    assert response is not None
+    top_match = response.chunks[0]
+    assert top_match is not None
+    assert top_match.metadata["document_id"] == expected_doc_id, f"Query '{query}' should match {expected_doc_id}"
--- a/tests/unit/providers/vector_io/test_sqlite_vec.py
+++ b/tests/unit/providers/vector_io/test_sqlite_vec.py
@ -0,0 +1,134 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import sqlite3
+
+import numpy as np
+import pytest
+import sqlite_vec
+
+from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
+from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import (
+    SQLiteVecIndex,
+    SQLiteVecVectorIOAdapter,
+    generate_chunk_id,
+)
+
+# This test is a unit test for the SQLiteVecVectorIOAdapter class. This should only contain
+# tests which are specific to this class. More general (API-level) tests should be placed in
+# tests/integration/vector_io/
+#
+# How to run this test:
+#
+# pytest tests/unit/providers/vector_io/test_sqlite_vec.py \
+# -v -s --tb=short --disable-warnings --asyncio-mode=auto
+
+SQLITE_VEC_PROVIDER = "sqlite_vec"
+EMBEDDING_DIMENSION = 384
+EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+
+
+@pytest.fixture(scope="session")
+def loop():
+    return asyncio.new_event_loop()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def sqlite_connection(loop):
+    conn = sqlite3.connect(":memory:")
+    try:
+        conn.enable_load_extension(True)
+        sqlite_vec.load(conn)
+        yield conn
+    finally:
+        conn.close()
+
+
+@pytest.fixture(scope="session", autouse=True)
+async def sqlite_vec_index(sqlite_connection):
+    return await SQLiteVecIndex.create(dimension=EMBEDDING_DIMENSION, connection=sqlite_connection, bank_id="test_bank")
+
+
+@pytest.fixture(scope="session")
+def sample_chunks():
+    """Generates chunks that force multiple batches for a single document to expose ID conflicts."""
+    n, k = 10, 3
+    sample = [
+        Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
+        for j in range(k)
+        for i in range(n)
+    ]
+    return sample
+
+
+@pytest.fixture(scope="session")
+def sample_embeddings(sample_chunks):
+    np.random.seed(42)
+    return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks])
+
+
+@pytest.mark.asyncio
+async def test_add_chunks(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings, batch_size=2)
+    cur = sqlite_vec_index.connection.cursor()
+    cur.execute(f"SELECT COUNT(*) FROM {sqlite_vec_index.metadata_table}")
+    count = cur.fetchone()[0]
+    assert count == len(sample_chunks)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    query_embedding = np.random.rand(EMBEDDING_DIMENSION).astype(np.float32)
+    response = await sqlite_vec_index.query(query_embedding, k=2, score_threshold=0.0)
+    assert isinstance(response, QueryChunksResponse)
+    assert len(response.chunks) == 2
+
+
+@pytest.mark.asyncio
+async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks):
+    """Test that chunk IDs do not conflict across batches when inserting chunks."""
+    # Reduce batch size to force multiple batches for same document
+    # since there are 10 chunks per document and batch size is 2
+    batch_size = 2
+    sample_embeddings = np.random.rand(len(sample_chunks), EMBEDDING_DIMENSION).astype(np.float32)
+
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings, batch_size=batch_size)
+
+    cur = sqlite_vec_index.connection.cursor()
+
+    # Retrieve all chunk IDs to check for duplicates
+    cur.execute(f"SELECT id FROM {sqlite_vec_index.metadata_table}")
+    chunk_ids = [row[0] for row in cur.fetchall()]
+    cur.close()
+
+    # Ensure all chunk IDs are unique
+    assert len(chunk_ids) == len(set(chunk_ids)), "Duplicate chunk IDs detected across batches!"
+
+
+@pytest.fixture(scope="session")
+async def sqlite_vec_adapter(sqlite_connection):
+    config = type("Config", (object,), {"db_path": ":memory:"})  # Mock config with in-memory database
+    adapter = SQLiteVecVectorIOAdapter(config=config, inference_api=None)
+    await adapter.initialize()
+    yield adapter
+    await adapter.shutdown()
+
+
+def test_generate_chunk_id():
+    chunks = [
+        Chunk(content="test", metadata={"document_id": "doc-1"}),
+        Chunk(content="test ", metadata={"document_id": "doc-1"}),
+        Chunk(content="test 3", metadata={"document_id": "doc-1"}),
+    ]
+
+    chunk_ids = sorted([generate_chunk_id(chunk.metadata["document_id"], chunk.content) for chunk in chunks])
+    assert chunk_ids == [
+        "177a1368-f6a8-0c50-6e92-18677f2c3de3",
+        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
+        "f68df25d-d9aa-ab4d-5684-64a233add20d",
+    ]