From 554c78ba6620b922a5eb2b4eae81ae7c7cdb471e Mon Sep 17 00:00:00 2001 From: kimbwook Date: Wed, 6 Aug 2025 14:51:10 +0900 Subject: [PATCH 1/4] add delete_chunk feature at chroma --- llama_stack/providers/remote/vector_io/chroma/chroma.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 26aeaedfb..442e64f5d 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -10,6 +10,7 @@ from typing import Any from urllib.parse import urlparse import chromadb +from chromadb.api.models.AsyncCollection import AsyncCollection from numpy.typing import NDArray from llama_stack.apis.files import Files @@ -116,7 +117,10 @@ class ChromaIndex(EmbeddingIndex): raise NotImplementedError("Keyword search is not supported in Chroma") async def delete_chunk(self, chunk_id: str) -> None: - raise NotImplementedError("delete_chunk is not supported in Chroma") + if isinstance(self.collection, AsyncCollection): + await self.collection.delete([chunk_id]) + else: + self.collection.delete([chunk_id]) async def query_hybrid( self, From 26fb2088771e9f88a61347ee2af9ee26a2b42b64 Mon Sep 17 00:00:00 2001 From: kimbwook Date: Thu, 7 Aug 2025 10:09:29 +0900 Subject: [PATCH 2/4] add query_keyword function --- .../remote/vector_io/chroma/chroma.py | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 442e64f5d..954817837 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -10,7 +10,6 @@ from typing import Any from urllib.parse import urlparse import chromadb -from chromadb.api.models.AsyncCollection import AsyncCollection from numpy.typing import NDArray from 
llama_stack.apis.files import Files @@ -109,18 +108,46 @@ class ChromaIndex(EmbeddingIndex): await maybe_await(self.client.delete_collection(self.collection.name)) async def query_keyword( - self, - query_string: str, - k: int, - score_threshold: float, + self, + query_string: str, + k: int, + score_threshold: float, ) -> QueryChunksResponse: - raise NotImplementedError("Keyword search is not supported in Chroma") + results = await maybe_await( + self.collection.query( + query_texts=[query_string], + where_document={"$contains": query_string}, + n_results=k, + include=["documents", "distances"], + ) + ) + + distances = results["distances"][0] if results["distances"] else [] + documents = results["documents"][0] if results["documents"] else [] + + chunks = [] + scores = [] + + for dist, doc in zip(distances, documents, strict=False): + try: + doc_data = json.loads(doc) + chunk = Chunk(**doc_data) + except Exception: + log.exception(f"Failed to parse document: {doc}") + continue + + score = 1.0 / (1.0 + float(dist)) if dist is not None else 1.0 + + if score < score_threshold: + continue + + chunks.append(chunk) + scores.append(score) + + return QueryChunksResponse(chunks=chunks, scores=scores) async def delete_chunk(self, chunk_id: str) -> None: - if isinstance(self.collection, AsyncCollection): - await self.collection.delete([chunk_id]) - else: - self.collection.delete([chunk_id]) + await maybe_await(self.collection.delete([chunk_id])) async def query_hybrid( self, From abd456232f6b0f730e05aa6074787828753beb79 Mon Sep 17 00:00:00 2001 From: kimbwook Date: Thu, 7 Aug 2025 10:19:44 +0900 Subject: [PATCH 3/4] apply pre-commit --- llama_stack/providers/remote/vector_io/chroma/chroma.py | 8 ++++---- tests/integration/vector_io/test_openai_vector_stores.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 954817837..75226a560 100644 --- 
a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -108,10 +108,10 @@ class ChromaIndex(EmbeddingIndex): await maybe_await(self.client.delete_collection(self.collection.name)) async def query_keyword( - self, - query_string: str, - k: int, - score_threshold: float, + self, + query_string: str, + k: int, + score_threshold: float, ) -> QueryChunksResponse: results = await maybe_await( self.collection.query( diff --git a/tests/integration/vector_io/test_openai_vector_stores.py b/tests/integration/vector_io/test_openai_vector_stores.py index 1c9ef92b6..0a5409ad9 100644 --- a/tests/integration/vector_io/test_openai_vector_stores.py +++ b/tests/integration/vector_io/test_openai_vector_stores.py @@ -52,6 +52,7 @@ def skip_if_provider_doesnt_support_openai_vector_stores_search(client_with_mode ], "keyword": [ "inline::sqlite-vec", + "remote::chromadb", ], "hybrid": [ "inline::sqlite-vec", From 5f2de49912169e2af3cb9d16a5d1e4f7f6a1ea8b Mon Sep 17 00:00:00 2001 From: kimbwook Date: Thu, 7 Aug 2025 10:39:32 +0900 Subject: [PATCH 4/4] add test code --- .../vector_io/test_openai_vector_stores.py | 1 - .../providers/vector_io/remote/test_chroma.py | 124 ++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tests/unit/providers/vector_io/remote/test_chroma.py diff --git a/tests/integration/vector_io/test_openai_vector_stores.py b/tests/integration/vector_io/test_openai_vector_stores.py index 0a5409ad9..1c9ef92b6 100644 --- a/tests/integration/vector_io/test_openai_vector_stores.py +++ b/tests/integration/vector_io/test_openai_vector_stores.py @@ -52,7 +52,6 @@ def skip_if_provider_doesnt_support_openai_vector_stores_search(client_with_mode ], "keyword": [ "inline::sqlite-vec", - "remote::chromadb", ], "hybrid": [ "inline::sqlite-vec", diff --git a/tests/unit/providers/vector_io/remote/test_chroma.py b/tests/unit/providers/vector_io/remote/test_chroma.py new file mode 100644 
index 000000000..ea9134f99 --- /dev/null +++ b/tests/unit/providers/vector_io/remote/test_chroma.py @@ -0,0 +1,124 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from llama_stack.apis.vector_io import QueryChunksResponse + +# Mock the entire chromadb module +chromadb_mock = MagicMock() +chromadb_mock.AsyncHttpClient = MagicMock +chromadb_mock.PersistentClient = MagicMock + +# Apply the mock before importing ChromaIndex +with patch.dict("sys.modules", {"chromadb": chromadb_mock}): + from llama_stack.providers.remote.vector_io.chroma.chroma import ChromaIndex + +# This test is a unit test for the ChromaIndex class. This should only contain +# tests which are specific to this class. More general (API-level) tests should be placed in +# tests/integration/vector_io/ +# +# How to run this test: +# +# pytest tests/unit/providers/vector_io/remote/test_chroma.py \ +# -v -s --tb=short --disable-warnings --asyncio-mode=auto + +CHROMA_PROVIDER = "chromadb" + + +@pytest.fixture +async def mock_chroma_collection() -> MagicMock: + """Create a mock Chroma collection with common method behaviors.""" + collection = MagicMock() + collection.name = "test_collection" + + # Mock add operation + collection.add.return_value = None + + # Mock query operation (shared by the vector and keyword search tests) + collection.query.return_value = { + "distances": [[0.1, 0.2]], + "documents": [ + [ + json.dumps({"content": "mock chunk 1", "metadata": {"document_id": "doc1"}}), + json.dumps({"content": "mock chunk 2", "metadata": {"document_id": "doc2"}}), + ] + ], + } + + # Mock delete operation + collection.delete.return_value = None + + return collection + + +@pytest.fixture +async def mock_chroma_client(mock_chroma_collection): + """Create a mock Chroma client with 
common method behaviors.""" + client = MagicMock() + + # Mock collection operations + client.get_or_create_collection.return_value = mock_chroma_collection + client.get_collection.return_value = mock_chroma_collection + client.delete_collection.return_value = None + + return client + + +@pytest.fixture +async def chroma_index(mock_chroma_client, mock_chroma_collection): + """Create a ChromaIndex with mocked client and collection.""" + index = ChromaIndex(client=mock_chroma_client, collection=mock_chroma_collection) + yield index + # No real cleanup needed since we're using mocks + + +async def test_add_chunks(chroma_index, sample_chunks, sample_embeddings, mock_chroma_collection): + await chroma_index.add_chunks(sample_chunks, sample_embeddings) + + # Verify data was inserted + mock_chroma_collection.add.assert_called_once() + + # Verify the add call had the right number of chunks + add_call = mock_chroma_collection.add.call_args + assert len(add_call[1]["documents"]) == len(sample_chunks) + + +async def test_query_chunks_vector( + chroma_index, sample_chunks, sample_embeddings, embedding_dimension, mock_chroma_collection +): + # Setup: Add chunks first + await chroma_index.add_chunks(sample_chunks, sample_embeddings) + + # Test vector search + query_embedding = np.random.rand(embedding_dimension).astype(np.float32) + response = await chroma_index.query_vector(query_embedding, k=2, score_threshold=0.0) + + assert isinstance(response, QueryChunksResponse) + assert len(response.chunks) == 2 + mock_chroma_collection.query.assert_called_once() + + +async def test_query_chunks_keyword_search(chroma_index, sample_chunks, sample_embeddings, mock_chroma_collection): + await chroma_index.add_chunks(sample_chunks, sample_embeddings) + + # Test keyword search + query_string = "Sentence 5" + response = await chroma_index.query_keyword(query_string=query_string, k=2, score_threshold=0.0) + + assert isinstance(response, QueryChunksResponse) + assert len(response.chunks) == 2 + + 
+async def test_delete_collection(chroma_index, mock_chroma_client): + # Test collection deletion + await chroma_index.delete() + + mock_chroma_client.delete_collection.assert_called_once_with(chroma_index.collection.name)