fix!: remove chunk_id property from Chunk class (#3954)

# What does this PR do?

chunk_id in the Chunk class executes actual logic to compute a chunk ID.
This sort of logic should not live in the API spec.

Instead, the providers should be in charge of calling generate_chunk_id,
and pass it to `Chunk`.

this removes the incorrect dependency between Provider impl and API impl

Signed-off-by: Charlie Doern <cdoern@redhat.com>
This commit is contained in:
Charlie Doern 2025-10-29 21:59:59 -04:00 committed by GitHub
parent 0ef9166c7e
commit e8ecc99524
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 40679 additions and 135 deletions

View file

@ -43,9 +43,15 @@ def embedding_dimension() -> int:
@pytest.fixture(scope="session")
def sample_chunks():
"""Generates chunks that force multiple batches for a single document to expose ID conflicts."""
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
n, k = 10, 3
sample = [
Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
Chunk(
content=f"Sentence {i} from document {j}",
chunk_id=generate_chunk_id(f"document-{j}", f"Sentence {i} from document {j}"),
metadata={"document_id": f"document-{j}"},
)
for j in range(k)
for i in range(n)
]
@ -53,6 +59,7 @@ def sample_chunks():
[
Chunk(
content=f"Sentence {i} from document {j + k}",
chunk_id=f"document-{j}-chunk-{i}",
chunk_metadata=ChunkMetadata(
document_id=f"document-{j + k}",
chunk_id=f"document-{j}-chunk-{i}",
@ -73,6 +80,7 @@ def sample_chunks_with_metadata():
sample = [
Chunk(
content=f"Sentence {i} from document {j}",
chunk_id=f"document-{j}-chunk-{i}",
metadata={"document_id": f"document-{j}"},
chunk_metadata=ChunkMetadata(
document_id=f"document-{j}",

View file

@ -49,9 +49,21 @@ def vector_store_id():
@pytest.fixture
def sample_chunks():
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
return [
Chunk(content="MOCK text content 1", mime_type="text/plain", metadata={"document_id": "mock-doc-1"}),
Chunk(content="MOCK text content 1", mime_type="text/plain", metadata={"document_id": "mock-doc-2"}),
Chunk(
content="MOCK text content 1",
chunk_id=generate_chunk_id("mock-doc-1", "MOCK text content 1"),
mime_type="text/plain",
metadata={"document_id": "mock-doc-1"},
),
Chunk(
content="MOCK text content 1",
chunk_id=generate_chunk_id("mock-doc-2", "MOCK text content 1"),
mime_type="text/plain",
metadata={"document_id": "mock-doc-2"},
),
]

View file

@ -434,9 +434,15 @@ async def test_query_chunks_hybrid_tie_breaking(
sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
):
"""Test tie-breaking and determinism when scores are equal."""
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
# Create two chunks with the same content and embedding
chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
chunk1 = Chunk(
content="identical", chunk_id=generate_chunk_id("docA", "identical"), metadata={"document_id": "docA"}
)
chunk2 = Chunk(
content="identical", chunk_id=generate_chunk_id("docB", "identical"), metadata={"document_id": "docB"}
)
chunks = [chunk1, chunk2]
# Use the same embedding for both chunks to ensure equal scores
same_embedding = sample_embeddings[0]

View file

@ -135,10 +135,24 @@ async def test_insert_chunks_with_missing_document_id(vector_io_adapter):
vector_io_adapter.cache["db1"] = fake_index
# Various document_id scenarios that shouldn't crash
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
chunks = [
Chunk(content="has doc_id in metadata", metadata={"document_id": "doc-1"}),
Chunk(content="no doc_id anywhere", metadata={"source": "test"}),
Chunk(content="doc_id in chunk_metadata", chunk_metadata=ChunkMetadata(document_id="doc-3")),
Chunk(
content="has doc_id in metadata",
chunk_id=generate_chunk_id("doc-1", "has doc_id in metadata"),
metadata={"document_id": "doc-1"},
),
Chunk(
content="no doc_id anywhere",
chunk_id=generate_chunk_id("unknown", "no doc_id anywhere"),
metadata={"source": "test"},
),
Chunk(
content="doc_id in chunk_metadata",
chunk_id=generate_chunk_id("doc-3", "doc_id in chunk_metadata"),
chunk_metadata=ChunkMetadata(document_id="doc-3"),
),
]
# Should work without KeyError
@ -151,7 +165,9 @@ async def test_document_id_with_invalid_type_raises_error():
from llama_stack.apis.vector_io import Chunk
# Integer document_id should raise TypeError
chunk = Chunk(content="test", metadata={"document_id": 12345})
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
chunk = Chunk(content="test", chunk_id=generate_chunk_id("test", "test"), metadata={"document_id": 12345})
with pytest.raises(TypeError) as exc_info:
_ = chunk.document_id
assert "metadata['document_id'] must be a string" in str(exc_info.value)
@ -159,7 +175,9 @@ async def test_document_id_with_invalid_type_raises_error():
async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
expected = QueryChunksResponse(chunks=[Chunk(content="c1", chunk_id=generate_chunk_id("test", "c1"))], scores=[0.1])
fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))
vector_io_adapter.cache["db1"] = fake_index

View file

@ -18,13 +18,12 @@ from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
def test_generate_chunk_id():
chunks = [
Chunk(content="test", metadata={"document_id": "doc-1"}),
Chunk(content="test ", metadata={"document_id": "doc-1"}),
Chunk(content="test 3", metadata={"document_id": "doc-1"}),
]
"""Test that generate_chunk_id produces expected hashes."""
chunk_id1 = generate_chunk_id("doc-1", "test")
chunk_id2 = generate_chunk_id("doc-1", "test ")
chunk_id3 = generate_chunk_id("doc-1", "test 3")
chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
chunk_ids = sorted([chunk_id1, chunk_id2, chunk_id3])
assert chunk_ids == [
"31d1f9a3-c8d2-66e7-3c37-af2acd329778",
"d07dade7-29c0-cda7-df29-0249a1dcbc3e",
@ -33,42 +32,49 @@ def test_generate_chunk_id():
def test_generate_chunk_id_with_window():
chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
"""Test that generate_chunk_id with chunk_window produces different IDs."""
# Create a chunk object to match the original test behavior (passing object to generate_chunk_id)
chunk = Chunk(content="test", chunk_id="placeholder", metadata={"document_id": "doc-1"})
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
# Verify that different windows produce different IDs
assert chunk_id1 != chunk_id2
assert len(chunk_id1) == 36 # Valid UUID format
assert len(chunk_id2) == 36 # Valid UUID format
def test_chunk_id():
# Test with existing chunk ID
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
# Test with document ID in metadata
chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
assert chunk_with_doc_id.chunk_id == generate_chunk_id("doc-1", "test")
# Test chunks with ChunkMetadata
chunk_with_metadata = Chunk(
def test_chunk_creation_with_explicit_id():
"""Test that chunks can be created with explicit chunk_id."""
chunk_id = generate_chunk_id("doc-1", "test")
chunk = Chunk(
content="test",
metadata={"document_id": "existing-id", "chunk_id": "chunk-id-1"},
chunk_id=chunk_id,
metadata={"document_id": "doc-1"},
)
assert chunk.chunk_id == chunk_id
assert chunk.chunk_id == "31d1f9a3-c8d2-66e7-3c37-af2acd329778"
def test_chunk_with_metadata():
"""Test chunks with ChunkMetadata."""
chunk_id = "chunk-id-1"
chunk = Chunk(
content="test",
chunk_id=chunk_id,
metadata={"document_id": "existing-id"},
chunk_metadata=ChunkMetadata(document_id="document_1"),
)
assert chunk_with_metadata.chunk_id == "chunk-id-1"
# Test with no ID or document ID
chunk_without_id = Chunk(content="test")
generated_id = chunk_without_id.chunk_id
assert isinstance(generated_id, str) and len(generated_id) == 36 # Should be a valid UUID
assert chunk.chunk_id == "chunk-id-1"
assert chunk.document_id == "existing-id" # metadata takes precedence
def test_stored_chunk_id_alias():
# Test with existing chunk ID alias
chunk_with_alias = Chunk(content="test", metadata={"document_id": "existing-id", "chunk_id": "chunk-id-1"})
assert chunk_with_alias.chunk_id == "chunk-id-1"
serialized_chunk = chunk_with_alias.model_dump()
assert serialized_chunk["stored_chunk_id"] == "chunk-id-1"
# showing chunk_id is not serialized (i.e., a computed field)
assert "chunk_id" not in serialized_chunk
assert chunk_with_alias.stored_chunk_id == "chunk-id-1"
def test_chunk_serialization():
"""Test that chunk_id is properly serialized."""
chunk = Chunk(
content="test",
chunk_id="test-chunk-id",
metadata={"document_id": "doc-1"},
)
serialized_chunk = chunk.model_dump()
assert serialized_chunk["chunk_id"] == "test-chunk-id"
assert "chunk_id" in serialized_chunk