diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index ab204a75a..7a83a9826 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -164,7 +164,8 @@ def make_overlapped_chunks( for i in range(0, len(tokens), window_len - overlap_len): toks = tokens[i : i + window_len] chunk = tokenizer.decode(toks) - chunk_id = generate_chunk_id(chunk, text) + chunk_window = f"{i}-{i + len(toks)}" + chunk_id = generate_chunk_id(chunk, text, chunk_window) chunk_metadata = metadata.copy() chunk_metadata["chunk_id"] = chunk_id chunk_metadata["document_id"] = document_id @@ -177,7 +178,7 @@ def make_overlapped_chunks( source=metadata.get("source", None), created_timestamp=metadata.get("created_timestamp", int(time.time())), updated_timestamp=int(time.time()), - chunk_window=f"{i}-{i + len(toks)}", + chunk_window=chunk_window, chunk_tokenizer=default_tokenizer, chunk_embedding_model=None, # This will be set in `VectorDBWithIndex.insert_chunks` content_token_count=len(toks), diff --git a/llama_stack/providers/utils/vector_io/chunk_utils.py b/llama_stack/providers/utils/vector_io/chunk_utils.py index 2a939bfba..01afa6ec8 100644 --- a/llama_stack/providers/utils/vector_io/chunk_utils.py +++ b/llama_stack/providers/utils/vector_io/chunk_utils.py @@ -8,7 +8,7 @@ import hashlib import uuid -def generate_chunk_id(document_id: str, chunk_text: str) -> str: +def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str: """ Generate a unique chunk ID using a hash of the document ID and chunk text. @@ -16,4 +16,6 @@ def generate_chunk_id(document_id: str, chunk_text: str) -> str: Adding usedforsecurity=False for compatibility with FIPS environments. """ hash_input = f"{document_id}:{chunk_text}".encode() + if chunk_window: + hash_input += f":{chunk_window}".encode() return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) diff --git a/tests/unit/providers/vector_io/test_chunk_utils.py b/tests/unit/providers/vector_io/test_chunk_utils.py index 941928b6d..535b76d73 100644 --- a/tests/unit/providers/vector_io/test_chunk_utils.py +++ b/tests/unit/providers/vector_io/test_chunk_utils.py @@ -32,6 +32,14 @@ def test_generate_chunk_id(): ] +def test_generate_chunk_id_with_window(): + chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) + chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") + chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") + assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" + assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" + + def test_chunk_id(): # Test with existing chunk ID chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})