mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-07 06:20:45 +00:00
chore: Updating chunk id generation to ensure uniqueness (#2618)
# What does this PR do? This handles an edge case for `generate_chunk_id` if the concatenation of `document_id` and `chunk_text` is not unique. Adding the window location ensures uniqueness. ## Test Plan Added unit test Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
parent
4afd619c56
commit
ea80ea63ac
3 changed files with 14 additions and 3 deletions
|
@ -164,7 +164,8 @@ def make_overlapped_chunks(
|
||||||
for i in range(0, len(tokens), window_len - overlap_len):
|
for i in range(0, len(tokens), window_len - overlap_len):
|
||||||
toks = tokens[i : i + window_len]
|
toks = tokens[i : i + window_len]
|
||||||
chunk = tokenizer.decode(toks)
|
chunk = tokenizer.decode(toks)
|
||||||
chunk_id = generate_chunk_id(chunk, text)
|
chunk_window = f"{i}-{i + len(toks)}"
|
||||||
|
chunk_id = generate_chunk_id(chunk, text, chunk_window)
|
||||||
chunk_metadata = metadata.copy()
|
chunk_metadata = metadata.copy()
|
||||||
chunk_metadata["chunk_id"] = chunk_id
|
chunk_metadata["chunk_id"] = chunk_id
|
||||||
chunk_metadata["document_id"] = document_id
|
chunk_metadata["document_id"] = document_id
|
||||||
|
@ -177,7 +178,7 @@ def make_overlapped_chunks(
|
||||||
source=metadata.get("source", None),
|
source=metadata.get("source", None),
|
||||||
created_timestamp=metadata.get("created_timestamp", int(time.time())),
|
created_timestamp=metadata.get("created_timestamp", int(time.time())),
|
||||||
updated_timestamp=int(time.time()),
|
updated_timestamp=int(time.time()),
|
||||||
chunk_window=f"{i}-{i + len(toks)}",
|
chunk_window=chunk_window,
|
||||||
chunk_tokenizer=default_tokenizer,
|
chunk_tokenizer=default_tokenizer,
|
||||||
chunk_embedding_model=None, # This will be set in `VectorDBWithIndex.insert_chunks`
|
chunk_embedding_model=None, # This will be set in `VectorDBWithIndex.insert_chunks`
|
||||||
content_token_count=len(toks),
|
content_token_count=len(toks),
|
||||||
|
|
|
@ -8,7 +8,7 @@ import hashlib
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
|
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
|
||||||
"""
|
"""
|
||||||
Generate a unique chunk ID using a hash of the document ID and chunk text.
|
Generate a unique chunk ID using a hash of the document ID and chunk text.
|
||||||
|
|
||||||
|
@ -16,4 +16,6 @@ def generate_chunk_id(document_id: str, chunk_text: str) -> str:
|
||||||
Adding usedforsecurity=False for compatibility with FIPS environments.
|
Adding usedforsecurity=False for compatibility with FIPS environments.
|
||||||
"""
|
"""
|
||||||
hash_input = f"{document_id}:{chunk_text}".encode()
|
hash_input = f"{document_id}:{chunk_text}".encode()
|
||||||
|
if chunk_window:
|
||||||
|
hash_input += f":{chunk_window}".encode()
|
||||||
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))
|
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))
|
||||||
|
|
|
@ -32,6 +32,14 @@ def test_generate_chunk_id():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_chunk_id_with_window():
|
||||||
|
chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
|
||||||
|
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
|
||||||
|
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
|
||||||
|
assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb"
|
||||||
|
assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154"
|
||||||
|
|
||||||
|
|
||||||
def test_chunk_id():
|
def test_chunk_id():
|
||||||
# Test with existing chunk ID
|
# Test with existing chunk ID
|
||||||
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
|
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue