feat: Add ChunkMetadata to Chunk (#2497)

# What does this PR do?
Adding `ChunkMetadata` so we can properly delete embeddings later.

More specifically, this PR refactors and extends the chunk metadata
handling in the vector database and introduces a distinction between
metadata used for model context and backend-only metadata required for
chunk management, storage, and retrieval. It also improves chunk ID
generation and propagation throughout the stack, enhances test coverage,
and adds new utility modules.

```python
class ChunkMetadata(BaseModel):
    """
    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
        will NOT be inserted into the context during inference, but is required for backend functionality.
        Use `metadata` in `Chunk` for metadata that will be used during inference.
    """
    document_id: str | None = None
    chunk_id: str | None = None
    source: str | None = None
    created_timestamp: int | None = None
    updated_timestamp: int | None = None
    chunk_window: str | None = None
    chunk_tokenizer: str | None = None
    chunk_embedding_model: str | None = None
    chunk_embedding_dimension: int | None = None
    content_token_count: int | None = None
    metadata_token_count: int | None = None
```
Eventually we can migrate the document_id out of the `metadata` field.
I've introduced the changes so that `ChunkMetadata` is backwards
compatible with `metadata`.

<!-- If resolving an issue, uncomment and update the line below -->
Closes https://github.com/meta-llama/llama-stack/issues/2501 

## Test Plan
Added unit tests

---------

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
Francisco Arceo 2025-06-25 13:55:23 -06:00 committed by GitHub
parent fa0b0c13d4
commit 82f13fe83e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 490 additions and 218 deletions

View file

@ -11190,6 +11190,115 @@
], ],
"title": "InsertRequest" "title": "InsertRequest"
}, },
"Chunk": {
"type": "object",
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk that will be used in the model context during inference."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
},
"stored_chunk_id": {
"type": "string",
"description": "The chunk ID that is stored in the vector database. Used for backend functionality."
},
"chunk_metadata": {
"$ref": "#/components/schemas/ChunkMetadata",
"description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
},
"ChunkMetadata": {
"type": "object",
"properties": {
"chunk_id": {
"type": "string",
"description": "The ID of the chunk. If not set, it will be generated based on the document ID and content."
},
"document_id": {
"type": "string",
"description": "The ID of the document this chunk belongs to."
},
"source": {
"type": "string",
"description": "The source of the content, such as a URL, file path, or other identifier."
},
"created_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was created."
},
"updated_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was last updated."
},
"chunk_window": {
"type": "string",
"description": "The window of the chunk, which can be used to group related chunks together."
},
"chunk_tokenizer": {
"type": "string",
"description": "The tokenizer used to create the chunk. Default is Tiktoken."
},
"chunk_embedding_model": {
"type": "string",
"description": "The embedding model used to create the chunk's embedding."
},
"chunk_embedding_dimension": {
"type": "integer",
"description": "The dimension of the embedding vector for the chunk."
},
"content_token_count": {
"type": "integer",
"description": "The number of tokens in the content of the chunk."
},
"metadata_token_count": {
"type": "integer",
"description": "The number of tokens in the metadata of the chunk."
}
},
"additionalProperties": false,
"title": "ChunkMetadata",
"description": "`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata` is set during chunk creation in `MemoryToolRuntimeImpl().insert()`and is not expected to change after. Use `Chunk.metadata` for metadata that will be used in the context during inference."
},
"InsertChunksRequest": { "InsertChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -11200,53 +11309,7 @@
"chunks": { "chunks": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "$ref": "#/components/schemas/Chunk"
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
}, },
"description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later." "description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later."
}, },
@ -14671,53 +14734,7 @@
"chunks": { "chunks": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "$ref": "#/components/schemas/Chunk"
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
} }
}, },
"scores": { "scores": {

View file

@ -7867,6 +7867,107 @@ components:
- vector_db_id - vector_db_id
- chunk_size_in_tokens - chunk_size_in_tokens
title: InsertRequest title: InsertRequest
Chunk:
type: object
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images, or other
types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk that will be used in the model context
during inference.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
stored_chunk_id:
type: string
description: >-
The chunk ID that is stored in the vector database. Used for backend functionality.
chunk_metadata:
$ref: '#/components/schemas/ChunkMetadata'
description: >-
Metadata for the chunk that will NOT be used in the context during inference.
The `chunk_metadata` is required for backend functionality.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
ChunkMetadata:
type: object
properties:
chunk_id:
type: string
description: >-
The ID of the chunk. If not set, it will be generated based on the document
ID and content.
document_id:
type: string
description: >-
The ID of the document this chunk belongs to.
source:
type: string
description: >-
The source of the content, such as a URL, file path, or other identifier.
created_timestamp:
type: integer
description: >-
An optional timestamp indicating when the chunk was created.
updated_timestamp:
type: integer
description: >-
An optional timestamp indicating when the chunk was last updated.
chunk_window:
type: string
description: >-
The window of the chunk, which can be used to group related chunks together.
chunk_tokenizer:
type: string
description: >-
The tokenizer used to create the chunk. Default is Tiktoken.
chunk_embedding_model:
type: string
description: >-
The embedding model used to create the chunk's embedding.
chunk_embedding_dimension:
type: integer
description: >-
The dimension of the embedding vector for the chunk.
content_token_count:
type: integer
description: >-
The number of tokens in the content of the chunk.
metadata_token_count:
type: integer
description: >-
The number of tokens in the metadata of the chunk.
additionalProperties: false
title: ChunkMetadata
description: >-
`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional
information about the chunk that will not be used in the context during
inference, but is required for backend functionality. The `ChunkMetadata` is
set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not
expected to change after. Use `Chunk.metadata` for metadata that will
be used in the context during inference.
InsertChunksRequest: InsertChunksRequest:
type: object type: object
properties: properties:
@ -7877,40 +7978,7 @@ components:
chunks: chunks:
type: array type: array
items: items:
type: object $ref: '#/components/schemas/Chunk'
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images,
or other types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk, such as document ID, source,
or other relevant information.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
description: >- description: >-
The chunks to insert. Each `Chunk` should contain content which can be The chunks to insert. Each `Chunk` should contain content which can be
interleaved text, images, or other types. `metadata`: `dict[str, Any]` interleaved text, images, or other types. `metadata`: `dict[str, Any]`
@ -10231,40 +10299,7 @@ components:
chunks: chunks:
type: array type: array
items: items:
type: object $ref: '#/components/schemas/Chunk'
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images,
or other types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk, such as document ID, source,
or other relevant information.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
scores: scores:
type: array type: array
items: items:

View file

@ -8,6 +8,7 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import uuid
from typing import Annotated, Any, Literal, Protocol, runtime_checkable from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -15,21 +16,80 @@ from pydantic import BaseModel, Field
from llama_stack.apis.inference import InterleavedContent from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.vector_io.chunk_utils import generate_chunk_id
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema from llama_stack.strong_typing.schema import register_schema
@json_schema_type
class ChunkMetadata(BaseModel):
    """
    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
    will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata`
    is set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not expected to change after.
    Use `Chunk.metadata` for metadata that will be used in the context during inference.
    :param chunk_id: The ID of the chunk. If not set, it will be generated based on the document ID and content.
    :param document_id: The ID of the document this chunk belongs to.
    :param source: The source of the content, such as a URL, file path, or other identifier.
    :param created_timestamp: An optional timestamp indicating when the chunk was created.
    :param updated_timestamp: An optional timestamp indicating when the chunk was last updated.
    :param chunk_window: The window of the chunk, which can be used to group related chunks together.
    :param chunk_tokenizer: The tokenizer used to create the chunk. Default is Tiktoken.
    :param chunk_embedding_model: The embedding model used to create the chunk's embedding.
    :param chunk_embedding_dimension: The dimension of the embedding vector for the chunk.
    :param content_token_count: The number of tokens in the content of the chunk.
    :param metadata_token_count: The number of tokens in the metadata of the chunk.
    """

    # Every field is optional and defaults to None, so a `ChunkMetadata()` with no
    # arguments is valid and callers fill in only what they know.
    chunk_id: str | None = None
    document_id: str | None = None
    source: str | None = None
    created_timestamp: int | None = None
    updated_timestamp: int | None = None
    chunk_window: str | None = None
    chunk_tokenizer: str | None = None
    chunk_embedding_model: str | None = None
    chunk_embedding_dimension: int | None = None
    content_token_count: int | None = None
    metadata_token_count: int | None = None
@json_schema_type
class Chunk(BaseModel):
    """
    A chunk of content that can be inserted into a vector database.
    :param content: The content of the chunk, which can be interleaved text, images, or other types.
    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
    :param metadata: Metadata associated with the chunk that will be used in the model context during inference.
    :param stored_chunk_id: The chunk ID that is stored in the vector database. Used for backend functionality.
    :param chunk_metadata: Metadata for the chunk that will NOT be used in the context during inference.
        The `chunk_metadata` is required for backend functionality.
    """

    content: InterleavedContent
    metadata: dict[str, Any] = Field(default_factory=dict)
    embedding: list[float] | None = None
    # The alias lets callers populate this field as "chunk_id"; `populate_by_name` (below) also
    # accepts the attribute name "stored_chunk_id". A plain `model_dump()` emits the attribute
    # name, so the `chunk_id` property defined further down is never part of the dumped data.
    stored_chunk_id: str | None = Field(default=None, alias="chunk_id")
    chunk_metadata: ChunkMetadata | None = None

    model_config = {"populate_by_name": True}

    def model_post_init(self, __context):
        # Backwards compatibility: a "chunk_id" supplied inside `metadata` is moved out of the
        # inference-facing metadata and becomes the stored chunk ID.
        if self.metadata and "chunk_id" in self.metadata:
            self.stored_chunk_id = self.metadata.pop("chunk_id")

    @property
    def chunk_id(self) -> str:
        """Return a stable ID for this chunk.

        Resolution order:
          1. `stored_chunk_id`, when set;
          2. `chunk_metadata.chunk_id`, when set;
          3. a hash of the document ID (from `metadata`, else `chunk_metadata`) and the content;
          4. a hash seeded with a random UUID, generated once and remembered.
        """
        if self.stored_chunk_id:
            return self.stored_chunk_id

        backend_metadata = self.chunk_metadata
        if backend_metadata is not None and backend_metadata.chunk_id:
            return backend_metadata.chunk_id

        if "document_id" in self.metadata:
            return generate_chunk_id(self.metadata["document_id"], str(self.content))
        if backend_metadata is not None and backend_metadata.document_id:
            return generate_chunk_id(backend_metadata.document_id, str(self.content))

        # No identifier to derive from: the ID has to be random. Remember it so that every
        # later access returns the same value -- callers read `chunk_id` several times per
        # chunk and the results must agree.
        self.stored_chunk_id = generate_chunk_id(str(uuid.uuid4()), str(self.content))
        return self.stored_chunk_id
@json_schema_type @json_schema_type

View file

@ -81,6 +81,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
chunks = [] chunks = []
for doc in documents: for doc in documents:
content = await content_from_doc(doc) content = await content_from_doc(doc)
# TODO: we should add enrichment here as URLs won't be added to the metadata by default
chunks.extend( chunks.extend(
make_overlapped_chunks( make_overlapped_chunks(
doc.document_id, doc.document_id,
@ -157,8 +158,24 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
break break
metadata_subset = {k: v for k, v in metadata.items() if k not in ["token_count", "metadata_token_count"]} # Add useful keys from chunk_metadata to metadata and remove some from metadata
text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset) chunk_metadata_keys_to_include_from_context = [
"chunk_id",
"document_id",
"source",
]
metadata_keys_to_exclude_from_context = [
"token_count",
"metadata_token_count",
]
# Build the metadata dict that is rendered into the chunk template.
metadata_for_context = {}
for k in chunk_metadata_keys_to_include_from_context:
    # `chunk.chunk_metadata` is optional; passing a default keeps getattr from raising
    # AttributeError when it is None, and records the key as None instead.
    metadata_for_context[k] = getattr(chunk.chunk_metadata, k, None)
# Keys from `metadata` are applied second, so they win over same-named backend keys.
for k in metadata:
    if k not in metadata_keys_to_exclude_from_context:
        metadata_for_context[k] = metadata[k]
text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_for_context)
picked.append(TextContentItem(text=text_content)) picked.append(TextContentItem(text=text_content))
picked.append(TextContentItem(text="END of knowledge_search tool results.\n")) picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))

View file

@ -5,12 +5,10 @@
# the root directory of this source tree. # the root directory of this source tree.
import asyncio import asyncio
import hashlib
import json import json
import logging import logging
import sqlite3 import sqlite3
import struct import struct
import uuid
from typing import Any from typing import Any
import numpy as np import numpy as np
@ -201,10 +199,7 @@ class SQLiteVecIndex(EmbeddingIndex):
batch_embeddings = embeddings[i : i + batch_size] batch_embeddings = embeddings[i : i + batch_size]
# Insert metadata # Insert metadata
metadata_data = [ metadata_data = [(chunk.chunk_id, chunk.model_dump_json()) for chunk in batch_chunks]
(generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
for chunk in batch_chunks
]
cur.executemany( cur.executemany(
f""" f"""
INSERT INTO {self.metadata_table} (id, chunk) INSERT INTO {self.metadata_table} (id, chunk)
@ -218,7 +213,7 @@ class SQLiteVecIndex(EmbeddingIndex):
embedding_data = [ embedding_data = [
( (
( (
generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.chunk_id,
serialize_vector(emb.tolist()), serialize_vector(emb.tolist()),
) )
) )
@ -230,10 +225,7 @@ class SQLiteVecIndex(EmbeddingIndex):
) )
# Insert FTS content # Insert FTS content
fts_data = [ fts_data = [(chunk.chunk_id, chunk.content) for chunk in batch_chunks]
(generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.content)
for chunk in batch_chunks
]
# DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT) # DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT)
cur.executemany( cur.executemany(
f"DELETE FROM {self.fts_table} WHERE id = ?;", f"DELETE FROM {self.fts_table} WHERE id = ?;",
@ -381,13 +373,12 @@ class SQLiteVecIndex(EmbeddingIndex):
vector_response = await self.query_vector(embedding, k, score_threshold) vector_response = await self.query_vector(embedding, k, score_threshold)
keyword_response = await self.query_keyword(query_string, k, score_threshold) keyword_response = await self.query_keyword(query_string, k, score_threshold)
# Convert responses to score dictionaries using generate_chunk_id # Convert responses to score dictionaries using chunk_id
vector_scores = { vector_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
} }
keyword_scores = { keyword_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score chunk.chunk_id: score
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False) for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
} }
@ -408,13 +399,7 @@ class SQLiteVecIndex(EmbeddingIndex):
filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold] filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
# Create a map of chunk_id to chunk for both responses # Create a map of chunk_id to chunk for both responses
chunk_map = {} chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
for c in vector_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
chunk_map[chunk_id] = c
for c in keyword_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
chunk_map[chunk_id] = c
# Use the map to look up chunks by their IDs # Use the map to look up chunks by their IDs
chunks = [] chunks = []
@ -757,9 +742,3 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
if vector_db_id not in self.cache: if vector_db_id not in self.cache:
raise ValueError(f"Vector DB {vector_db_id} not found") raise ValueError(f"Vector DB {vector_db_id} not found")
return await self.cache[vector_db_id].query_chunks(query, params) return await self.cache[vector_db_id].query_chunks(query, params)
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
"""Generate a unique chunk ID using a hash of document ID and chunk text."""
hash_input = f"{document_id}:{chunk_text}".encode()
return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))

View file

@ -70,8 +70,8 @@ class QdrantIndex(EmbeddingIndex):
) )
points = [] points = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings, strict=False)): for _i, (chunk, embedding) in enumerate(zip(chunks, embeddings, strict=False)):
chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}" chunk_id = chunk.chunk_id
points.append( points.append(
PointStruct( PointStruct(
id=convert_id(chunk_id), id=convert_id(chunk_id),

View file

@ -7,6 +7,7 @@ import base64
import io import io
import logging import logging
import re import re
import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any from typing import Any
@ -23,12 +24,13 @@ from llama_stack.apis.common.content_types import (
) )
from llama_stack.apis.tools import RAGDocument from llama_stack.apis.tools import RAGDocument
from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse
from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str, interleaved_content_as_str,
) )
from llama_stack.providers.utils.vector_io.chunk_utils import generate_chunk_id
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -148,6 +150,7 @@ async def content_from_doc(doc: RAGDocument) -> str:
def make_overlapped_chunks( def make_overlapped_chunks(
document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any] document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
) -> list[Chunk]: ) -> list[Chunk]:
default_tokenizer = "DEFAULT_TIKTOKEN_TOKENIZER"
tokenizer = Tokenizer.get_instance() tokenizer = Tokenizer.get_instance()
tokens = tokenizer.encode(text, bos=False, eos=False) tokens = tokenizer.encode(text, bos=False, eos=False)
try: try:
@ -161,16 +164,32 @@ def make_overlapped_chunks(
for i in range(0, len(tokens), window_len - overlap_len): for i in range(0, len(tokens), window_len - overlap_len):
toks = tokens[i : i + window_len] toks = tokens[i : i + window_len]
chunk = tokenizer.decode(toks) chunk = tokenizer.decode(toks)
chunk_id = generate_chunk_id(chunk, text)
chunk_metadata = metadata.copy() chunk_metadata = metadata.copy()
chunk_metadata["chunk_id"] = chunk_id
chunk_metadata["document_id"] = document_id chunk_metadata["document_id"] = document_id
chunk_metadata["token_count"] = len(toks) chunk_metadata["token_count"] = len(toks)
chunk_metadata["metadata_token_count"] = len(metadata_tokens) chunk_metadata["metadata_token_count"] = len(metadata_tokens)
backend_chunk_metadata = ChunkMetadata(
chunk_id=chunk_id,
document_id=document_id,
source=metadata.get("source", None),
created_timestamp=metadata.get("created_timestamp", int(time.time())),
updated_timestamp=int(time.time()),
chunk_window=f"{i}-{i + len(toks)}",
chunk_tokenizer=default_tokenizer,
chunk_embedding_model=None, # This will be set in `VectorDBWithIndex.insert_chunks`
content_token_count=len(toks),
metadata_token_count=len(metadata_tokens),
)
# chunk is a string # chunk is a string
chunks.append( chunks.append(
Chunk( Chunk(
content=chunk, content=chunk,
metadata=chunk_metadata, metadata=chunk_metadata,
chunk_metadata=backend_chunk_metadata,
) )
) )
@ -237,6 +256,9 @@ class VectorDBWithIndex:
for i, c in enumerate(chunks): for i, c in enumerate(chunks):
if c.embedding is None: if c.embedding is None:
chunks_to_embed.append(c) chunks_to_embed.append(c)
if c.chunk_metadata:
c.chunk_metadata.chunk_embedding_model = self.vector_db.embedding_model
c.chunk_metadata.chunk_embedding_dimension = self.vector_db.embedding_dimension
else: else:
_validate_embedding(c.embedding, i, self.vector_db.embedding_dimension) _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import hashlib
import uuid
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
    """Generate a deterministic chunk ID from a hash of the document ID and chunk text.

    The same `(document_id, chunk_text)` pair always yields the same ID.

    :param document_id: Identifier of the document the chunk belongs to.
    :param chunk_text: Text of the chunk.
    :returns: The MD5 digest of `"{document_id}:{chunk_text}"`, formatted as a hyphenated UUID string.
    """
    hash_input = f"{document_id}:{chunk_text}".encode()
    # MD5 only derives an identifier here, it is not a security measure; declaring that keeps
    # the call usable on Python builds that restrict MD5 (e.g. FIPS mode).
    digest = hashlib.md5(hash_input, usedforsecurity=False).hexdigest()
    return str(uuid.UUID(digest))

View file

@ -9,7 +9,7 @@ import random
import numpy as np import numpy as np
import pytest import pytest
from llama_stack.apis.vector_io import Chunk from llama_stack.apis.vector_io import Chunk, ChunkMetadata
EMBEDDING_DIMENSION = 384 EMBEDDING_DIMENSION = 384
@ -33,6 +33,20 @@ def sample_chunks():
for j in range(k) for j in range(k)
for i in range(n) for i in range(n)
] ]
sample.extend(
[
Chunk(
content=f"Sentence {i} from document {j + k}",
chunk_metadata=ChunkMetadata(
document_id=f"document-{j + k}",
chunk_id=f"document-{j}-chunk-{i}",
source=f"example source-{j + k}-{i}",
),
)
for j in range(k)
for i in range(n)
]
)
return sample return sample

View file

@ -0,0 +1,66 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.vector_io import Chunk, ChunkMetadata
from llama_stack.providers.utils.vector_io.chunk_utils import generate_chunk_id
# This test is a unit test for the chunk_utils.py helpers. This should only contain
# tests which are specific to this file. More general (API-level) tests should be placed in
# tests/integration/vector_io/
#
# How to run this test:
#
# pytest tests/unit/providers/vector_io/test_chunk_utils.py \
# -v -s --tb=short --disable-warnings --asyncio-mode=auto
def test_generate_chunk_id():
    """Chunks that share a document ID but differ in content get distinct, fixed IDs."""
    contents = ("test", "test ", "test 3")
    observed_ids = [Chunk(content=text, metadata={"document_id": "doc-1"}).chunk_id for text in contents]
    observed_ids.sort()

    expected_ids = [
        "177a1368-f6a8-0c50-6e92-18677f2c3de3",
        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
        "f68df25d-d9aa-ab4d-5684-64a233add20d",
    ]
    assert observed_ids == expected_ids
def test_chunk_id():
    """Exercise each way `Chunk.chunk_id` resolves an ID."""
    # Only a document ID in metadata (no explicit chunk ID): the ID is derived from the
    # document ID and the content, so it is a fixed, known value.
    chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
    assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350"

    # Same path, checked against the helper directly rather than a hard-coded value
    chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
    assert chunk_with_doc_id.chunk_id == generate_chunk_id("doc-1", "test")

    # An explicit "chunk_id" in metadata wins, even when ChunkMetadata is also supplied
    chunk_with_metadata = Chunk(
        content="test",
        metadata={"document_id": "existing-id", "chunk_id": "chunk-id-1"},
        chunk_metadata=ChunkMetadata(document_id="document_1"),
    )
    assert chunk_with_metadata.chunk_id == "chunk-id-1"

    # Neither a chunk ID nor a document ID: an ID is still generated
    chunk_without_id = Chunk(content="test")
    generated_id = chunk_without_id.chunk_id
    assert isinstance(generated_id, str) and len(generated_id) == 36  # length of a hyphenated UUID string
def test_stored_chunk_id_alias():
    """A chunk_id supplied via `metadata` is exposed as `chunk_id` and serialized as `stored_chunk_id`."""
    metadata = {"document_id": "existing-id", "chunk_id": "chunk-id-1"}
    chunk = Chunk(content="test", metadata=metadata)

    # The attribute reflects the ID given in metadata.
    assert chunk.chunk_id == "chunk-id-1"

    dumped = chunk.model_dump()
    # The dump carries the ID under "stored_chunk_id" ...
    assert dumped["stored_chunk_id"] == "chunk-id-1"
    # ... and has no "chunk_id" key of its own.
    assert "chunk_id" not in dumped

    # The stored field is also readable directly on the instance.
    assert chunk.stored_chunk_id == "chunk-id-1"

View file

@ -81,7 +81,7 @@ __QUERY = "Sample query"
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 30)]) @pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 60)])
async def test_qdrant_adapter_returns_expected_chunks( async def test_qdrant_adapter_returns_expected_chunks(
qdrant_adapter: QdrantVectorIOAdapter, qdrant_adapter: QdrantVectorIOAdapter,
vector_db_id, vector_db_id,

View file

@ -15,7 +15,6 @@ from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import (
SQLiteVecIndex, SQLiteVecIndex,
SQLiteVecVectorIOAdapter, SQLiteVecVectorIOAdapter,
_create_sqlite_connection, _create_sqlite_connection,
generate_chunk_id,
) )
# This test is a unit test for the SQLiteVecVectorIOAdapter class. This should only contain # This test is a unit test for the SQLiteVecVectorIOAdapter class. This should only contain
@ -65,6 +64,14 @@ async def test_query_chunks_vector(sqlite_vec_index, sample_chunks, sample_embed
assert len(response.chunks) == 2 assert len(response.chunks) == 2
# NOTE(review): the neighbouring async tests carry @pytest.mark.asyncio and this
# one does not - presumably it relies on asyncio auto mode; confirm it really runs.
@pytest.mark.xfail(reason="Chunk Metadata not yet supported for SQLite-vec", strict=True)
async def test_query_chunk_metadata(sqlite_vec_index, sample_chunks, sample_embeddings):
    """Expected failure: chunk_metadata of the last returned chunk should match the last inserted chunk."""
    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)

    # Query with the first sample embedding, asking for the two nearest chunks.
    result = await sqlite_vec_index.query_vector(sample_embeddings[0], k=2, score_threshold=0.0)

    last_returned = result.chunks[-1]
    last_inserted = sample_chunks[-1]
    assert last_returned.chunk_metadata == last_inserted.chunk_metadata
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sample_embeddings): async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sample_embeddings):
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
@ -150,21 +157,6 @@ async def sqlite_vec_adapter(sqlite_connection):
await adapter.shutdown() await adapter.shutdown()
def test_generate_chunk_id():
    """`generate_chunk_id` yields the pinned IDs for three contents under one document ID."""
    doc_id = "doc-1"
    chunks = [Chunk(content=text, metadata={"document_id": doc_id}) for text in ("test", "test ", "test 3")]

    # Derive each ID from the chunk's own document ID and content, then sort so
    # the comparison does not depend on input order.
    generated = [generate_chunk_id(item.metadata["document_id"], item.content) for item in chunks]
    generated.sort()

    assert generated == [
        "177a1368-f6a8-0c50-6e92-18677f2c3de3",
        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
        "f68df25d-d9aa-ab4d-5684-64a233add20d",
    ]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings): async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test hybrid search when keyword search returns no matches - should still return vector results.""" """Test hybrid search when keyword search returns no matches - should still return vector results."""
@ -339,7 +331,7 @@ async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks
# Verify scores are in descending order # Verify scores are in descending order
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
# Verify we get results from both the vector-similar document and keyword-matched document # Verify we get results from both the vector-similar document and keyword-matched document
doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks} doc_ids = {chunk.metadata.get("document_id") or chunk.chunk_metadata.document_id for chunk in response.chunks}
assert "document-0" in doc_ids # From vector search assert "document-0" in doc_ids # From vector search
assert "document-2" in doc_ids # From keyword search assert "document-2" in doc_ids # From keyword search
@ -364,7 +356,11 @@ async def test_query_chunks_hybrid_weighted_reranker_parametrization(
reranker_params={"alpha": 1.0}, reranker_params={"alpha": 1.0},
) )
assert len(response.chunks) > 0 # Should get at least one result assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) assert any(
"document-0"
in (chunk.metadata.get("document_id") or (chunk.chunk_metadata.document_id if chunk.chunk_metadata else ""))
for chunk in response.chunks
)
# alpha=0.0 (should behave like pure vector) # alpha=0.0 (should behave like pure vector)
response = await sqlite_vec_index.query_hybrid( response = await sqlite_vec_index.query_hybrid(
@ -389,7 +385,11 @@ async def test_query_chunks_hybrid_weighted_reranker_parametrization(
reranker_params={"alpha": 0.7}, reranker_params={"alpha": 0.7},
) )
assert len(response.chunks) > 0 # Should get at least one result assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) assert any(
"document-0"
in (chunk.metadata.get("document_id") or (chunk.chunk_metadata.document_id if chunk.chunk_metadata else ""))
for chunk in response.chunks
)
@pytest.mark.asyncio @pytest.mark.asyncio

View file

@ -4,10 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from unittest.mock import MagicMock from unittest.mock import AsyncMock, MagicMock
import pytest import pytest
from llama_stack.apis.vector_io import (
Chunk,
ChunkMetadata,
QueryChunksResponse,
)
from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
@ -17,3 +22,41 @@ class TestRagQuery:
rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock()) rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock())
with pytest.raises(ValueError): with pytest.raises(ValueError):
await rag_tool.query(content=MagicMock(), vector_db_ids=[]) await rag_tool.query(content=MagicMock(), vector_db_ids=[])
@pytest.mark.asyncio
async def test_query_chunk_metadata_handling(self):
    """Query a mocked vector store and check the metadata line rendered into the result.

    The single returned chunk carries both a plain `metadata` dict and a
    `ChunkMetadata`. The expected string lists only chunk_id, document_id,
    source and key1; none of the token-count entries appear in it.
    """
    rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock())
    content = "test query content"
    vector_db_ids = ["db1"]

    chunk_metadata = ChunkMetadata(
        document_id="doc1",
        chunk_id="chunk1",
        source="test_source",
        metadata_token_count=5,
    )
    interleaved_content = MagicMock()
    chunk = Chunk(
        content=interleaved_content,
        metadata={
            "key1": "value1",
            "token_count": 10,
            "metadata_token_count": 5,
            # Note this is inserted into `metadata` during MemoryToolRuntimeImpl().insert()
            "document_id": "doc1",
        },
        stored_chunk_id="chunk1",
        chunk_metadata=chunk_metadata,
    )

    # Stub the vector store so the query returns exactly this one chunk.
    query_response = QueryChunksResponse(chunks=[chunk], scores=[1.0])
    rag_tool.vector_io_api.query_chunks = AsyncMock(return_value=query_response)

    result = await rag_tool.query(content=content, vector_db_ids=vector_db_ids)

    # Guard first: both checks must precede the indexing below, otherwise a
    # missing result surfaces as a TypeError instead of a clear assertion.
    assert result is not None
    assert result.content is not None

    expected_metadata_string = (
        "Metadata: {'chunk_id': 'chunk1', 'document_id': 'doc1', 'source': 'test_source', 'key1': 'value1'}"
    )
    assert expected_metadata_string in result.content[1].text