chore: Updating chunk id generation to ensure uniqueness (#2618)

# What does this PR do?
This handles an edge case in `generate_chunk_id` where the concatenation of
`document_id` and `chunk_text` is not unique (the same text can appear at more
than one position in a document). Adding the chunk's window location to the id
input ensures uniqueness.
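
For orientation, here is a minimal sketch of what the extended signature could look like, assuming the id is derived by hashing the inputs into a UUID; the exact implementation lives in the repo and may differ:

```python
import hashlib
import uuid


def generate_chunk_id(document: str, chunk_text: str, chunk_window: str | None = None) -> str:
    """Sketch: derive a deterministic chunk id from the document and chunk text.

    The optional `chunk_window` ("start-end" token offsets) is folded into the
    hash input so identical text at different positions still gets a distinct id.
    """
    hash_input = f"{document}:{chunk_text}".encode("utf-8")
    if chunk_window:
        hash_input += f":{chunk_window}".encode("utf-8")
    return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))
```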

## Test Plan
Added a unit test covering the duplicate `document_id`/`chunk_text` edge case.
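
As a sketch of the kind of assertion such a test makes (using the hypothetical `generate_chunk_id` sketch above; the actual test in the PR may differ):

```python
def test_duplicate_chunks_get_distinct_ids():
    # Same document and chunk text, different window locations:
    # before this change the two ids would collide.
    id_a = generate_chunk_id("doc-1", "repeated text", chunk_window="0-10")
    id_b = generate_chunk_id("doc-1", "repeated text", chunk_window="5-15")
    assert id_a != id_b
```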

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>

```diff
@@ -164,7 +164,8 @@ def make_overlapped_chunks(
     for i in range(0, len(tokens), window_len - overlap_len):
         toks = tokens[i : i + window_len]
         chunk = tokenizer.decode(toks)
-        chunk_id = generate_chunk_id(chunk, text)
+        chunk_window = f"{i}-{i + len(toks)}"
+        chunk_id = generate_chunk_id(chunk, text, chunk_window)
         chunk_metadata = metadata.copy()
         chunk_metadata["chunk_id"] = chunk_id
         chunk_metadata["document_id"] = document_id
@@ -177,7 +178,7 @@ def make_overlapped_chunks(
             source=metadata.get("source", None),
             created_timestamp=metadata.get("created_timestamp", int(time.time())),
             updated_timestamp=int(time.time()),
-            chunk_window=f"{i}-{i + len(toks)}",
+            chunk_window=chunk_window,
             chunk_tokenizer=default_tokenizer,
             chunk_embedding_model=None,  # This will be set in `VectorDBWithIndex.insert_chunks`
             content_token_count=len(toks),
```
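
Note the design choice in the diff: the window string is now computed once into a local `chunk_window` and reused both as an input to `generate_chunk_id` and as the `chunk_window` metadata field, so the id and the stored window can never drift apart.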