chore: Updating chunk id generation to ensure uniqueness

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
Francisco Javier Arceo 2025-07-03 23:34:38 -04:00
parent 4afd619c56
commit 0bd7a07454
3 changed files with 14 additions and 3 deletions

View file

@ -8,7 +8,7 @@ import hashlib
import uuid
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
"""
Generate a unique chunk ID using a hash of the document ID, the chunk text, and (when provided) the chunk window.
@ -16,4 +16,6 @@ def generate_chunk_id(document_id: str, chunk_text: str) -> str:
Adding usedforsecurity=False for compatibility with FIPS environments.
"""
hash_input = f"{document_id}:{chunk_text}".encode()
if chunk_window:
hash_input += f":{chunk_window}".encode()
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))