mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
feat: Add ChunkMetadata to Chunk (#2497)
# What does this PR do? Adding `ChunkMetadata` so we can properly delete embeddings later. More specifically, this PR refactors and extends the chunk metadata handling in the vector database and introduces a distinction between metadata used for model context and backend-only metadata required for chunk management, storage, and retrieval. It also improves chunk ID generation and propagation throughout the stack, enhances test coverage, and adds new utility modules. ```python class ChunkMetadata(BaseModel): """ `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will NOT be inserted into the context during inference, but is required for backend functionality. Use `metadata` in `Chunk` for metadata that will be used during inference. """ document_id: str | None = None chunk_id: str | None = None source: str | None = None created_timestamp: int | None = None updated_timestamp: int | None = None chunk_window: str | None = None chunk_tokenizer: str | None = None chunk_embedding_model: str | None = None chunk_embedding_dimension: int | None = None content_token_count: int | None = None metadata_token_count: int | None = None ``` Eventually we can migrate the document_id out of the `metadata` field. I've introduced the changes so that `ChunkMetadata` is backwards compatible with `metadata`. <!-- If resolving an issue, uncomment and update the line below --> Closes https://github.com/meta-llama/llama-stack/issues/2501 ## Test Plan Added unit tests --------- Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
parent
fa0b0c13d4
commit
82f13fe83e
14 changed files with 490 additions and 218 deletions
|
@ -81,6 +81,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
|
|||
chunks = []
|
||||
for doc in documents:
|
||||
content = await content_from_doc(doc)
|
||||
# TODO: we should add enrichment here as URLs won't be added to the metadata by default
|
||||
chunks.extend(
|
||||
make_overlapped_chunks(
|
||||
doc.document_id,
|
||||
|
@ -157,8 +158,24 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
|
|||
)
|
||||
break
|
||||
|
||||
metadata_subset = {k: v for k, v in metadata.items() if k not in ["token_count", "metadata_token_count"]}
|
||||
text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset)
|
||||
# Add useful keys from chunk_metadata to metadata and remove some from metadata
|
||||
chunk_metadata_keys_to_include_from_context = [
|
||||
"chunk_id",
|
||||
"document_id",
|
||||
"source",
|
||||
]
|
||||
metadata_keys_to_exclude_from_context = [
|
||||
"token_count",
|
||||
"metadata_token_count",
|
||||
]
|
||||
metadata_for_context = {}
|
||||
for k in chunk_metadata_keys_to_include_from_context:
|
||||
metadata_for_context[k] = getattr(chunk.chunk_metadata, k)
|
||||
for k in metadata:
|
||||
if k not in metadata_keys_to_exclude_from_context:
|
||||
metadata_for_context[k] = metadata[k]
|
||||
|
||||
text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_for_context)
|
||||
picked.append(TextContentItem(text=text_content))
|
||||
|
||||
picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue