feat: Add ChunkMetadata to Chunk (#2497)

# What does this PR do?
Adding `ChunkMetadata` so we can properly delete embeddings later.

More specifically, this PR refactors and extends the chunk metadata
handling in the vector database and introduces a distinction between
metadata used for model context and backend-only metadata required for
chunk management, storage, and retrieval. It also improves chunk ID
generation and propagation throughout the stack, enhances test coverage,
and adds new utility modules.

```python
class ChunkMetadata(BaseModel):
    """
    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
    will NOT be inserted into the context during inference, but is required for backend functionality.
    Use `metadata` in `Chunk` for metadata that will be used during inference.

    Every field is optional and defaults to `None`.
    """
    # ID of the document this chunk belongs to.
    document_id: str | None = None
    # ID of the chunk; per the API schema, generated from the document ID and content when not set.
    chunk_id: str | None = None
    # Source of the content, such as a URL, file path, or other identifier.
    source: str | None = None
    # When the chunk was created (integer timestamp; unit not specified here).
    created_timestamp: int | None = None
    # When the chunk was last updated (integer timestamp; unit not specified here).
    updated_timestamp: int | None = None
    # Window of the chunk, which can be used to group related chunks together.
    chunk_window: str | None = None
    # Tokenizer used to create the chunk.
    chunk_tokenizer: str | None = None
    # Embedding model used to create the chunk's embedding.
    chunk_embedding_model: str | None = None
    # Dimension of the embedding vector for the chunk.
    chunk_embedding_dimension: int | None = None
    # Number of tokens in the content of the chunk.
    content_token_count: int | None = None
    # Number of tokens in the metadata of the chunk.
    metadata_token_count: int | None = None
```
Eventually we can migrate the document_id out of the `metadata` field.
I've introduced the changes so that `ChunkMetadata` is backwards
compatible with `metadata`.

<!-- If resolving an issue, uncomment and update the line below -->
Closes https://github.com/meta-llama/llama-stack/issues/2501 

## Test Plan
Added unit tests

---------

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
Francisco Arceo 2025-06-25 13:55:23 -06:00 committed by GitHub
parent fa0b0c13d4
commit 82f13fe83e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 490 additions and 218 deletions

View file

@ -11190,6 +11190,115 @@
],
"title": "InsertRequest"
},
"Chunk": {
"type": "object",
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk that will be used in the model context during inference."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
},
"stored_chunk_id": {
"type": "string",
"description": "The chunk ID that is stored in the vector database. Used for backend functionality."
},
"chunk_metadata": {
"$ref": "#/components/schemas/ChunkMetadata",
"description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required for backend functionality."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
},
"ChunkMetadata": {
"type": "object",
"properties": {
"chunk_id": {
"type": "string",
"description": "The ID of the chunk. If not set, it will be generated based on the document ID and content."
},
"document_id": {
"type": "string",
"description": "The ID of the document this chunk belongs to."
},
"source": {
"type": "string",
"description": "The source of the content, such as a URL, file path, or other identifier."
},
"created_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was created."
},
"updated_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was last updated."
},
"chunk_window": {
"type": "string",
"description": "The window of the chunk, which can be used to group related chunks together."
},
"chunk_tokenizer": {
"type": "string",
"description": "The tokenizer used to create the chunk. Default is Tiktoken."
},
"chunk_embedding_model": {
"type": "string",
"description": "The embedding model used to create the chunk's embedding."
},
"chunk_embedding_dimension": {
"type": "integer",
"description": "The dimension of the embedding vector for the chunk."
},
"content_token_count": {
"type": "integer",
"description": "The number of tokens in the content of the chunk."
},
"metadata_token_count": {
"type": "integer",
"description": "The number of tokens in the metadata of the chunk."
}
},
"additionalProperties": false,
"title": "ChunkMetadata",
"description": "`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata` is set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not expected to change afterward. Use `Chunk.metadata` for metadata that will be used in the context during inference."
},
"InsertChunksRequest": {
"type": "object",
"properties": {
@ -11200,53 +11309,7 @@
"chunks": {
"type": "array",
"items": {
"type": "object",
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
"$ref": "#/components/schemas/Chunk"
},
"description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, it configures how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later."
},
@ -14671,53 +14734,7 @@
"chunks": {
"type": "array",
"items": {
"type": "object",
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
"$ref": "#/components/schemas/Chunk"
}
},
"scores": {