mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00
feat: Add ChunkMetadata to Chunk (#2497)
# What does this PR do? Adding `ChunkMetadata` so we can properly delete embeddings later. More specifically, this PR refactors and extends the chunk metadata handling in the vector database and introduces a distinction between metadata used for model context and backend-only metadata required for chunk management, storage, and retrieval. It also improves chunk ID generation and propagation throughout the stack, enhances test coverage, and adds new utility modules. ```python class ChunkMetadata(BaseModel): """ `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will NOT be inserted into the context during inference, but is required for backend functionality. Use `metadata` in `Chunk` for metadata that will be used during inference. """ document_id: str | None = None chunk_id: str | None = None source: str | None = None created_timestamp: int | None = None updated_timestamp: int | None = None chunk_window: str | None = None chunk_tokenizer: str | None = None chunk_embedding_model: str | None = None chunk_embedding_dimension: int | None = None content_token_count: int | None = None metadata_token_count: int | None = None ``` Eventually we can migrate the document_id out of the `metadata` field. I've introduced the changes so that `ChunkMetadata` is backwards compatible with `metadata`. <!-- If resolving an issue, uncomment and update the line below --> Closes https://github.com/meta-llama/llama-stack/issues/2501 ## Test Plan Added unit tests --------- Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
parent
fa0b0c13d4
commit
82f13fe83e
14 changed files with 490 additions and 218 deletions
205
docs/_static/llama-stack-spec.html
vendored
205
docs/_static/llama-stack-spec.html
vendored
|
@ -11190,6 +11190,115 @@
|
|||
],
|
||||
"title": "InsertRequest"
|
||||
},
|
||||
"Chunk": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"content": {
|
||||
"$ref": "#/components/schemas/InterleavedContent",
|
||||
"description": "The content of the chunk, which can be interleaved text, images, or other types."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Metadata associated with the chunk that will be used in the model context during inference."
|
||||
},
|
||||
"embedding": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
|
||||
},
|
||||
"stored_chunk_id": {
|
||||
"type": "string",
|
||||
"description": "The chunk ID that is stored in the vector database. Used for backend functionality."
|
||||
},
|
||||
"chunk_metadata": {
|
||||
"$ref": "#/components/schemas/ChunkMetadata",
|
||||
"description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required for backend functionality."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"metadata"
|
||||
],
|
||||
"title": "Chunk",
|
||||
"description": "A chunk of content that can be inserted into a vector database."
|
||||
},
|
||||
"ChunkMetadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"chunk_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the chunk. If not set, it will be generated based on the document ID and content."
|
||||
},
|
||||
"document_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the document this chunk belongs to."
|
||||
},
|
||||
"source": {
|
||||
"type": "string",
|
||||
"description": "The source of the content, such as a URL, file path, or other identifier."
|
||||
},
|
||||
"created_timestamp": {
|
||||
"type": "integer",
|
||||
"description": "An optional timestamp indicating when the chunk was created."
|
||||
},
|
||||
"updated_timestamp": {
|
||||
"type": "integer",
|
||||
"description": "An optional timestamp indicating when the chunk was last updated."
|
||||
},
|
||||
"chunk_window": {
|
||||
"type": "string",
|
||||
"description": "The window of the chunk, which can be used to group related chunks together."
|
||||
},
|
||||
"chunk_tokenizer": {
|
||||
"type": "string",
|
||||
"description": "The tokenizer used to create the chunk. Default is Tiktoken."
|
||||
},
|
||||
"chunk_embedding_model": {
|
||||
"type": "string",
|
||||
"description": "The embedding model used to create the chunk's embedding."
|
||||
},
|
||||
"chunk_embedding_dimension": {
|
||||
"type": "integer",
|
||||
"description": "The dimension of the embedding vector for the chunk."
|
||||
},
|
||||
"content_token_count": {
|
||||
"type": "integer",
|
||||
"description": "The number of tokens in the content of the chunk."
|
||||
},
|
||||
"metadata_token_count": {
|
||||
"type": "integer",
|
||||
"description": "The number of tokens in the metadata of the chunk."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"title": "ChunkMetadata",
|
||||
"description": "`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata` is set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not expected to change after. Use `Chunk.metadata` for metadata that will be used in the context during inference."
|
||||
},
|
||||
"InsertChunksRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -11200,53 +11309,7 @@
|
|||
"chunks": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"content": {
|
||||
"$ref": "#/components/schemas/InterleavedContent",
|
||||
"description": "The content of the chunk, which can be interleaved text, images, or other types."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
|
||||
},
|
||||
"embedding": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"metadata"
|
||||
],
|
||||
"title": "Chunk",
|
||||
"description": "A chunk of content that can be inserted into a vector database."
|
||||
"$ref": "#/components/schemas/Chunk"
|
||||
},
|
||||
"description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later."
|
||||
},
|
||||
|
@ -14671,53 +14734,7 @@
|
|||
"chunks": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"content": {
|
||||
"$ref": "#/components/schemas/InterleavedContent",
|
||||
"description": "The content of the chunk, which can be interleaved text, images, or other types."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
|
||||
},
|
||||
"embedding": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"metadata"
|
||||
],
|
||||
"title": "Chunk",
|
||||
"description": "A chunk of content that can be inserted into a vector database."
|
||||
"$ref": "#/components/schemas/Chunk"
|
||||
}
|
||||
},
|
||||
"scores": {
|
||||
|
|
171
docs/_static/llama-stack-spec.yaml
vendored
171
docs/_static/llama-stack-spec.yaml
vendored
|
@ -7867,6 +7867,107 @@ components:
|
|||
- vector_db_id
|
||||
- chunk_size_in_tokens
|
||||
title: InsertRequest
|
||||
Chunk:
|
||||
type: object
|
||||
properties:
|
||||
content:
|
||||
$ref: '#/components/schemas/InterleavedContent'
|
||||
description: >-
|
||||
The content of the chunk, which can be interleaved text, images, or other
|
||||
types.
|
||||
metadata:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: >-
|
||||
Metadata associated with the chunk that will be used in the model context
|
||||
during inference.
|
||||
embedding:
|
||||
type: array
|
||||
items:
|
||||
type: number
|
||||
description: >-
|
||||
Optional embedding for the chunk. If not provided, it will be computed
|
||||
later.
|
||||
stored_chunk_id:
|
||||
type: string
|
||||
description: >-
|
||||
The chunk ID that is stored in the vector database. Used for backend functionality.
|
||||
chunk_metadata:
|
||||
$ref: '#/components/schemas/ChunkMetadata'
|
||||
description: >-
|
||||
Metadata for the chunk that will NOT be used in the context during inference.
|
||||
The `chunk_metadata` is required for backend functionality.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- metadata
|
||||
title: Chunk
|
||||
description: >-
|
||||
A chunk of content that can be inserted into a vector database.
|
||||
ChunkMetadata:
|
||||
type: object
|
||||
properties:
|
||||
chunk_id:
|
||||
type: string
|
||||
description: >-
|
||||
The ID of the chunk. If not set, it will be generated based on the document
|
||||
ID and content.
|
||||
document_id:
|
||||
type: string
|
||||
description: >-
|
||||
The ID of the document this chunk belongs to.
|
||||
source:
|
||||
type: string
|
||||
description: >-
|
||||
The source of the content, such as a URL, file path, or other identifier.
|
||||
created_timestamp:
|
||||
type: integer
|
||||
description: >-
|
||||
An optional timestamp indicating when the chunk was created.
|
||||
updated_timestamp:
|
||||
type: integer
|
||||
description: >-
|
||||
An optional timestamp indicating when the chunk was last updated.
|
||||
chunk_window:
|
||||
type: string
|
||||
description: >-
|
||||
The window of the chunk, which can be used to group related chunks together.
|
||||
chunk_tokenizer:
|
||||
type: string
|
||||
description: >-
|
||||
The tokenizer used to create the chunk. Default is Tiktoken.
|
||||
chunk_embedding_model:
|
||||
type: string
|
||||
description: >-
|
||||
The embedding model used to create the chunk's embedding.
|
||||
chunk_embedding_dimension:
|
||||
type: integer
|
||||
description: >-
|
||||
The dimension of the embedding vector for the chunk.
|
||||
content_token_count:
|
||||
type: integer
|
||||
description: >-
|
||||
The number of tokens in the content of the chunk.
|
||||
metadata_token_count:
|
||||
type: integer
|
||||
description: >-
|
||||
The number of tokens in the metadata of the chunk.
|
||||
additionalProperties: false
|
||||
title: ChunkMetadata
|
||||
description: >-
|
||||
`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional
|
||||
information about the chunk that will not be used in the context during
|
||||
inference, but is required for backend functionality. The `ChunkMetadata` is
|
||||
set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not
|
||||
expected to change after. Use `Chunk.metadata` for metadata that will
|
||||
be used in the context during inference.
|
||||
InsertChunksRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -7877,40 +7978,7 @@ components:
|
|||
chunks:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
content:
|
||||
$ref: '#/components/schemas/InterleavedContent'
|
||||
description: >-
|
||||
The content of the chunk, which can be interleaved text, images,
|
||||
or other types.
|
||||
metadata:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: >-
|
||||
Metadata associated with the chunk, such as document ID, source,
|
||||
or other relevant information.
|
||||
embedding:
|
||||
type: array
|
||||
items:
|
||||
type: number
|
||||
description: >-
|
||||
Optional embedding for the chunk. If not provided, it will be computed
|
||||
later.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- metadata
|
||||
title: Chunk
|
||||
description: >-
|
||||
A chunk of content that can be inserted into a vector database.
|
||||
$ref: '#/components/schemas/Chunk'
|
||||
description: >-
|
||||
The chunks to insert. Each `Chunk` should contain content which can be
|
||||
interleaved text, images, or other types. `metadata`: `dict[str, Any]`
|
||||
|
@ -10231,40 +10299,7 @@ components:
|
|||
chunks:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
content:
|
||||
$ref: '#/components/schemas/InterleavedContent'
|
||||
description: >-
|
||||
The content of the chunk, which can be interleaved text, images,
|
||||
or other types.
|
||||
metadata:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: >-
|
||||
Metadata associated with the chunk, such as document ID, source,
|
||||
or other relevant information.
|
||||
embedding:
|
||||
type: array
|
||||
items:
|
||||
type: number
|
||||
description: >-
|
||||
Optional embedding for the chunk. If not provided, it will be computed
|
||||
later.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- metadata
|
||||
title: Chunk
|
||||
description: >-
|
||||
A chunk of content that can be inserted into a vector database.
|
||||
$ref: '#/components/schemas/Chunk'
|
||||
scores:
|
||||
type: array
|
||||
items:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue