fix!: remove chunk_id property from Chunk class (#3954)

# What does this PR do?

chunk_id in the Chunk class executes actual logic to compute a chunk ID.
This sort of logic should not live in the API spec.

Instead, the providers should be in charge of calling `generate_chunk_id`
and passing the resulting ID to `Chunk`.

This removes the incorrect dependency between the provider implementation and the API implementation.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
This commit is contained in:
Charlie Doern 2025-10-29 21:59:59 -04:00 committed by GitHub
parent 0ef9166c7e
commit e8ecc99524
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 40679 additions and 135 deletions

View file

@ -43,9 +43,15 @@ def embedding_dimension() -> int:
@pytest.fixture(scope="session")
def sample_chunks():
"""Generates chunks that force multiple batches for a single document to expose ID conflicts."""
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
n, k = 10, 3
sample = [
Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
Chunk(
content=f"Sentence {i} from document {j}",
chunk_id=generate_chunk_id(f"document-{j}", f"Sentence {i} from document {j}"),
metadata={"document_id": f"document-{j}"},
)
for j in range(k)
for i in range(n)
]
@ -53,6 +59,7 @@ def sample_chunks():
[
Chunk(
content=f"Sentence {i} from document {j + k}",
chunk_id=f"document-{j}-chunk-{i}",
chunk_metadata=ChunkMetadata(
document_id=f"document-{j + k}",
chunk_id=f"document-{j}-chunk-{i}",
@ -73,6 +80,7 @@ def sample_chunks_with_metadata():
sample = [
Chunk(
content=f"Sentence {i} from document {j}",
chunk_id=f"document-{j}-chunk-{i}",
metadata={"document_id": f"document-{j}"},
chunk_metadata=ChunkMetadata(
document_id=f"document-{j}",