feat: Adding ChunkMetadata

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
This commit is contained in:
Francisco Javier Arceo 2025-06-23 14:59:11 -04:00
parent 6fde601765
commit f90fce218e
13 changed files with 416 additions and 206 deletions

View file

@ -11190,6 +11190,110 @@
], ],
"title": "InsertRequest" "title": "InsertRequest"
}, },
"Chunk": {
"type": "object",
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk that will be used during inference."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
},
"chunk_metadata": {
"$ref": "#/components/schemas/ChunkMetadata",
"description": "Metadata for the chunk that will NOT be inserted into the context during inference, but that is required for backend functionality."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
},
"ChunkMetadata": {
"type": "object",
"properties": {
"document_id": {
"type": "string",
"description": "The ID of the document this chunk belongs to."
},
"chunk_id": {
"type": "string"
},
"source": {
"type": "string",
"description": "The source of the content, such as a URL or file path."
},
"created_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was created."
},
"updated_timestamp": {
"type": "integer",
"description": "An optional timestamp indicating when the chunk was last updated."
},
"chunk_window": {
"type": "string",
"description": "The window of the chunk, which can be used to group related chunks together."
},
"chunk_tokenizer": {
"type": "string",
"description": "The tokenizer used to create the chunk. Default is Tiktoken."
},
"chunk_embedding_model": {
"type": "string",
"description": "The embedding model used to create the chunk's embedding."
},
"chunk_embedding_dimension": {
"type": "integer",
"description": "The dimension of the embedding vector for the chunk."
},
"content_token_count": {
"type": "integer",
"description": "The number of tokens in the content of the chunk."
},
"metadata_token_count": {
"type": "integer",
"description": "The number of tokens in the metadata of the chunk."
}
},
"additionalProperties": false,
"title": "ChunkMetadata",
"description": "`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that will NOT be inserted into the context during inference, but is required for backend functionality. Use `metadata` in `Chunk` for metadata that will be used during inference."
},
"InsertChunksRequest": { "InsertChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -11200,53 +11304,7 @@
"chunks": { "chunks": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "$ref": "#/components/schemas/Chunk"
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
}, },
"description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later." "description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later."
}, },
@ -14667,53 +14725,7 @@
"chunks": { "chunks": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "$ref": "#/components/schemas/Chunk"
"properties": {
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types."
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Metadata associated with the chunk, such as document ID, source, or other relevant information."
},
"embedding": {
"type": "array",
"items": {
"type": "number"
},
"description": "Optional embedding for the chunk. If not provided, it will be computed later."
}
},
"additionalProperties": false,
"required": [
"content",
"metadata"
],
"title": "Chunk",
"description": "A chunk of content that can be inserted into a vector database."
} }
}, },
"scores": { "scores": {

View file

@ -7867,6 +7867,97 @@ components:
- vector_db_id - vector_db_id
- chunk_size_in_tokens - chunk_size_in_tokens
title: InsertRequest title: InsertRequest
Chunk:
type: object
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images, or other
types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk that will be used during inference.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
chunk_metadata:
$ref: '#/components/schemas/ChunkMetadata'
description: >-
Metadata for the chunk that will NOT be inserted into the context during
inference, but that is required for backend functionality.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
ChunkMetadata:
type: object
properties:
document_id:
type: string
description: >-
The ID of the document this chunk belongs to.
chunk_id:
type: string
source:
type: string
description: >-
The source of the content, such as a URL or file path.
created_timestamp:
type: integer
description: >-
An optional timestamp indicating when the chunk was created.
updated_timestamp:
type: integer
description: >-
An optional timestamp indicating when the chunk was last updated.
chunk_window:
type: string
description: >-
The window of the chunk, which can be used to group related chunks together.
chunk_tokenizer:
type: string
description: >-
The tokenizer used to create the chunk. Default is Tiktoken.
chunk_embedding_model:
type: string
description: >-
The embedding model used to create the chunk's embedding.
chunk_embedding_dimension:
type: integer
description: >-
The dimension of the embedding vector for the chunk.
content_token_count:
type: integer
description: >-
The number of tokens in the content of the chunk.
metadata_token_count:
type: integer
description: >-
The number of tokens in the metadata of the chunk.
additionalProperties: false
title: ChunkMetadata
description: >-
`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional
information about the chunk that will NOT be inserted into the context
during inference, but is required for backend functionality. Use `metadata`
in `Chunk` for metadata that will be used during inference.
InsertChunksRequest: InsertChunksRequest:
type: object type: object
properties: properties:
@ -7877,40 +7968,7 @@ components:
chunks: chunks:
type: array type: array
items: items:
type: object $ref: '#/components/schemas/Chunk'
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images,
or other types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk, such as document ID, source,
or other relevant information.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
description: >- description: >-
The chunks to insert. Each `Chunk` should contain content which can be The chunks to insert. Each `Chunk` should contain content which can be
interleaved text, images, or other types. `metadata`: `dict[str, Any]` interleaved text, images, or other types. `metadata`: `dict[str, Any]`
@ -10227,40 +10285,7 @@ components:
chunks: chunks:
type: array type: array
items: items:
type: object $ref: '#/components/schemas/Chunk'
properties:
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the chunk, which can be interleaved text, images,
or other types.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
Metadata associated with the chunk, such as document ID, source,
or other relevant information.
embedding:
type: array
items:
type: number
description: >-
Optional embedding for the chunk. If not provided, it will be computed
later.
additionalProperties: false
required:
- content
- metadata
title: Chunk
description: >-
A chunk of content that can be inserted into a vector database.
scores: scores:
type: array type: array
items: items:

View file

@ -19,17 +19,52 @@ from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema from llama_stack.strong_typing.schema import register_schema
@json_schema_type
class ChunkMetadata(BaseModel):
    """
    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
    will NOT be inserted into the context during inference, but is required for backend functionality.
    Use `metadata` in `Chunk` for metadata that will be used during inference.

    :param document_id: The ID of the document this chunk belongs to.
    :param chunk_id: The ID of the chunk. If not set, backends may derive one from the document ID and the chunk content.
    :param source: The source of the content, such as a URL or file path.
    :param created_timestamp: An optional timestamp indicating when the chunk was created.
    :param updated_timestamp: An optional timestamp indicating when the chunk was last updated.
    :param chunk_window: The window of the chunk, which can be used to group related chunks together.
    :param chunk_tokenizer: The tokenizer used to create the chunk. Default is Tiktoken.
    :param chunk_embedding_model: The embedding model used to create the chunk's embedding.
    :param chunk_embedding_dimension: The dimension of the embedding vector for the chunk.
    :param content_token_count: The number of tokens in the content of the chunk.
    :param metadata_token_count: The number of tokens in the metadata of the chunk.
    """

    # Every field is optional and defaults to None.
    document_id: str | None = None
    chunk_id: str | None = None
    source: str | None = None
    created_timestamp: int | None = None
    updated_timestamp: int | None = None
    chunk_window: str | None = None
    chunk_tokenizer: str | None = None
    chunk_embedding_model: str | None = None
    chunk_embedding_dimension: int | None = None
    content_token_count: int | None = None
    metadata_token_count: int | None = None
@json_schema_type
class Chunk(BaseModel): class Chunk(BaseModel):
""" """
A chunk of content that can be inserted into a vector database. A chunk of content that can be inserted into a vector database.
:param content: The content of the chunk, which can be interleaved text, images, or other types. :param content: The content of the chunk, which can be interleaved text, images, or other types.
:param embedding: Optional embedding for the chunk. If not provided, it will be computed later. :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
:param metadata: Metadata associated with the chunk, such as document ID, source, or other relevant information. :param metadata: Metadata associated with the chunk that will be used during inference.
:param chunk_metadata: Metadata for the chunk that will NOT be inserted into the context during inference,
but that is required for backend functionality.
""" """
content: InterleavedContent content: InterleavedContent
metadata: dict[str, Any] = Field(default_factory=dict) metadata: dict[str, Any] = Field(default_factory=dict)
embedding: list[float] | None = None embedding: list[float] | None = None
chunk_metadata: ChunkMetadata | None = None
@json_schema_type @json_schema_type

View file

@ -148,6 +148,9 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
] ]
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
metadata = chunk.metadata metadata = chunk.metadata
# update chunk.metadata with the chunk.chunk_metadata if it exists
if chunk.chunk_metadata:
metadata = {**metadata, **chunk.chunk_metadata.dict()}
tokens += metadata.get("token_count", 0) tokens += metadata.get("token_count", 0)
tokens += metadata.get("metadata_token_count", 0) tokens += metadata.get("metadata_token_count", 0)
@ -157,7 +160,19 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
break break
metadata_subset = {k: v for k, v in metadata.items() if k not in ["token_count", "metadata_token_count"]} metadata_fields_to_exclude_from_context = [
"chunk_tokenizer",
"chunk_window",
"token_count",
"metadata_token_count",
"chunk_tokenizer",
"chunk_embedding_model",
"created_timestamp",
"updated_timestamp",
"chunk_window",
"content_token_count",
]
metadata_subset = {k: v for k, v in metadata.items() if k not in metadata_fields_to_exclude_from_context}
text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset) text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset)
picked.append(TextContentItem(text=text_content)) picked.append(TextContentItem(text=text_content))

View file

@ -5,12 +5,10 @@
# the root directory of this source tree. # the root directory of this source tree.
import asyncio import asyncio
import hashlib
import json import json
import logging import logging
import sqlite3 import sqlite3
import struct import struct
import uuid
from typing import Any from typing import Any
import numpy as np import numpy as np
@ -33,6 +31,7 @@ from llama_stack.providers.utils.memory.vector_store import (
EmbeddingIndex, EmbeddingIndex,
VectorDBWithIndex, VectorDBWithIndex,
) )
from llama_stack.providers.utils.vector_io.chunk_utils import extract_or_generate_chunk_id
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -202,8 +201,7 @@ class SQLiteVecIndex(EmbeddingIndex):
# Insert metadata # Insert metadata
metadata_data = [ metadata_data = [
(generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json()) (extract_or_generate_chunk_id(chunk), chunk.model_dump_json()) for chunk in batch_chunks
for chunk in batch_chunks
] ]
cur.executemany( cur.executemany(
f""" f"""
@ -218,7 +216,7 @@ class SQLiteVecIndex(EmbeddingIndex):
embedding_data = [ embedding_data = [
( (
( (
generate_chunk_id(chunk.metadata["document_id"], chunk.content), extract_or_generate_chunk_id(chunk),
serialize_vector(emb.tolist()), serialize_vector(emb.tolist()),
) )
) )
@ -230,10 +228,7 @@ class SQLiteVecIndex(EmbeddingIndex):
) )
# Insert FTS content # Insert FTS content
fts_data = [ fts_data = [(extract_or_generate_chunk_id(chunk), chunk.content) for chunk in batch_chunks]
(generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.content)
for chunk in batch_chunks
]
# DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT) # DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT)
cur.executemany( cur.executemany(
f"DELETE FROM {self.fts_table} WHERE id = ?;", f"DELETE FROM {self.fts_table} WHERE id = ?;",
@ -383,11 +378,11 @@ class SQLiteVecIndex(EmbeddingIndex):
# Convert responses to score dictionaries using generate_chunk_id # Convert responses to score dictionaries using generate_chunk_id
vector_scores = { vector_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score extract_or_generate_chunk_id(chunk): score
for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False) for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
} }
keyword_scores = { keyword_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score extract_or_generate_chunk_id(chunk): score
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False) for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
} }
@ -410,10 +405,10 @@ class SQLiteVecIndex(EmbeddingIndex):
# Create a map of chunk_id to chunk for both responses # Create a map of chunk_id to chunk for both responses
chunk_map = {} chunk_map = {}
for c in vector_response.chunks: for c in vector_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content)) chunk_id = extract_or_generate_chunk_id(c)
chunk_map[chunk_id] = c chunk_map[chunk_id] = c
for c in keyword_response.chunks: for c in keyword_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content)) chunk_id = extract_or_generate_chunk_id(c)
chunk_map[chunk_id] = c chunk_map[chunk_id] = c
# Use the map to look up chunks by their IDs # Use the map to look up chunks by their IDs
@ -757,9 +752,3 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
if vector_db_id not in self.cache: if vector_db_id not in self.cache:
raise ValueError(f"Vector DB {vector_db_id} not found") raise ValueError(f"Vector DB {vector_db_id} not found")
return await self.cache[vector_db_id].query_chunks(query, params) return await self.cache[vector_db_id].query_chunks(query, params)
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
"""Generate a unique chunk ID using a hash of document ID and chunk text."""
hash_input = f"{document_id}:{chunk_text}".encode()
return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))

View file

@ -72,7 +72,11 @@ class QdrantIndex(EmbeddingIndex):
points = [] points = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings, strict=False)): for i, (chunk, embedding) in enumerate(zip(chunks, embeddings, strict=False)):
chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}" chunk_id = (
f"{chunk.metadata.get('document_id')}:chunk-{i}"
if chunk.metadata
else f"{chunk.chunk_metadata.document_id}:chunk-{i}"
)
points.append( points.append(
PointStruct( PointStruct(
id=convert_id(chunk_id), id=convert_id(chunk_id),

View file

@ -7,6 +7,7 @@ import base64
import io import io
import logging import logging
import re import re
import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any from typing import Any
@ -23,12 +24,13 @@ from llama_stack.apis.common.content_types import (
) )
from llama_stack.apis.tools import RAGDocument from llama_stack.apis.tools import RAGDocument
from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse
from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str, interleaved_content_as_str,
) )
from llama_stack.providers.utils.vector_io.chunk_utils import generate_chunk_id
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -148,6 +150,10 @@ async def content_from_doc(doc: RAGDocument) -> str:
def make_overlapped_chunks( def make_overlapped_chunks(
document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any] document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
) -> list[Chunk]: ) -> list[Chunk]:
default_tokenizer = "DEFAULT_TIKTOKEN_TOKENIZER"
default_embedding_model = (
"DEFAULT_EMBEDDING_MODEL" # This will be correctly updated in `VectorDBWithIndex.insert_chunks`
)
tokenizer = Tokenizer.get_instance() tokenizer = Tokenizer.get_instance()
tokens = tokenizer.encode(text, bos=False, eos=False) tokens = tokenizer.encode(text, bos=False, eos=False)
try: try:
@ -166,11 +172,25 @@ def make_overlapped_chunks(
chunk_metadata["token_count"] = len(toks) chunk_metadata["token_count"] = len(toks)
chunk_metadata["metadata_token_count"] = len(metadata_tokens) chunk_metadata["metadata_token_count"] = len(metadata_tokens)
# Backend-only metadata for this chunk (kept separate from the inference-facing `metadata` dict).
# The chunk ID is derived from the owning document's ID and this chunk's own text, matching the
# `generate_chunk_id(document_id, chunk_text)` contract. Hashing the chunk text against the full
# document text instead would give identical IDs to chunks of two documents that share the same
# text, and would disagree with the ID `extract_or_generate_chunk_id` derives from
# `document_id` + chunk content.
backend_chunk_metadata = ChunkMetadata(
    document_id=document_id,
    chunk_id=generate_chunk_id(document_id, chunk),
    source=metadata.get("source", None),
    # Preserve a caller-supplied creation time; otherwise stamp with the current epoch seconds.
    created_timestamp=metadata.get("created_timestamp", int(time.time())),
    updated_timestamp=int(time.time()),
    # Token offsets [start-end) of this chunk within the tokenized document.
    chunk_window=f"{i}-{i + len(toks)}",
    chunk_tokenizer=default_tokenizer,
    chunk_embedding_model=default_embedding_model,
    content_token_count=len(toks),
    metadata_token_count=len(metadata_tokens),
)
# chunk is a string # chunk is a string
chunks.append( chunks.append(
Chunk( Chunk(
content=chunk, content=chunk,
metadata=chunk_metadata, metadata=chunk_metadata,
chunk_metadata=backend_chunk_metadata,
) )
) )
@ -235,9 +255,13 @@ class VectorDBWithIndex:
) -> None: ) -> None:
chunks_to_embed = [] chunks_to_embed = []
for i, c in enumerate(chunks): for i, c in enumerate(chunks):
# this should be done in `make_overlapped_chunks` but we do it here for convenience
if c.embedding is None: if c.embedding is None:
chunks_to_embed.append(c) chunks_to_embed.append(c)
else: else:
if c.chunk_metadata:
c.chunk_metadata.chunk_embedding_model = self.vector_db.embedding_model
c.chunk_metadata.chunk_embedding_dimension = self.vector_db.embedding_dimension
_validate_embedding(c.embedding, i, self.vector_db.embedding_dimension) _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)
if chunks_to_embed: if chunks_to_embed:

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import hashlib
import logging
import uuid
from llama_stack.apis.vector_io import Chunk
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
    """Build a deterministic chunk ID from a document ID and the chunk's text.

    The two values are joined as ``"<document_id>:<chunk_text>"``, MD5-hashed, and the
    32-character hex digest is rendered as a canonical UUID string. The same inputs
    always yield the same ID.

    :param document_id: ID of the document the chunk belongs to.
    :param chunk_text: Text content of the chunk.
    :returns: UUID-formatted string derived from the hash.
    """
    digest = hashlib.md5(f"{document_id}:{chunk_text}".encode()).hexdigest()
    return str(uuid.UUID(hex=digest))
def extract_chunk_id_from_metadata(chunk: Chunk) -> str | None:
    """Return the chunk ID already stored on a chunk, if any.

    Lookup order:

    1. ``chunk.chunk_metadata.chunk_id`` (the `ChunkMetadata` location), when it is set.
    2. ``chunk.metadata["chunk_id"]``, for compatibility with older Chunks that stored the
       chunk ID in the free-form metadata dict rather than in `ChunkMetadata`.

    :param chunk: The chunk to inspect.
    :returns: The stored chunk ID as a string, or None if neither location holds one.
    """
    backend_metadata = chunk.chunk_metadata
    if backend_metadata is not None:
        backend_chunk_id = getattr(backend_metadata, "chunk_id", None)
        # Only short-circuit on a real value: an unset chunk_id must not hide a
        # legacy ID stored in `chunk.metadata`.
        if backend_chunk_id is not None:
            return backend_chunk_id

    legacy_chunk_id = chunk.metadata.get("chunk_id")
    # Treat an explicit None as "absent" instead of turning it into the string "None".
    if legacy_chunk_id is not None:
        return str(legacy_chunk_id)
    return None
def extract_or_generate_chunk_id(chunk: Chunk) -> str:
    """Return a chunk's stored ID, or derive one when none is stored.

    Resolution order:

    1. A stored chunk ID (see `extract_chunk_id_from_metadata`).
    2. A deterministic ID from ``chunk.metadata["document_id"]`` and the chunk content, for
       compatibility with older Chunks that stored the document_id in the metadata.
    3. A deterministic ID from ``chunk.chunk_metadata.document_id`` and the chunk content.
    4. A random UUID, with a warning.

    Callers typically invoke this several times for the same chunk (e.g. once per table
    they write), so steps 1-3 are deterministic. Only step 4 yields a different value on
    each call.

    :param chunk: The chunk to identify.
    :returns: The chunk ID as a string.
    """
    stored_chunk_id = extract_chunk_id_from_metadata(chunk)
    if stored_chunk_id:
        return stored_chunk_id

    # Legacy location first, so IDs of previously stored chunks stay unchanged.
    if "document_id" in chunk.metadata:
        return generate_chunk_id(chunk.metadata["document_id"], str(chunk.content))

    # Chunks that carry their document ID only in ChunkMetadata still get a stable ID.
    backend_metadata = chunk.chunk_metadata
    if backend_metadata is not None and backend_metadata.document_id is not None:
        return generate_chunk_id(backend_metadata.document_id, str(chunk.content))

    logging.warning("Chunk has no ID or document_id in metadata. Generating random ID.")
    return str(uuid.uuid4())

View file

@ -9,7 +9,7 @@ import random
import numpy as np import numpy as np
import pytest import pytest
from llama_stack.apis.vector_io import Chunk from llama_stack.apis.vector_io import Chunk, ChunkMetadata
EMBEDDING_DIMENSION = 384 EMBEDDING_DIMENSION = 384
@ -33,6 +33,20 @@ def sample_chunks():
for j in range(k) for j in range(k)
for i in range(n) for i in range(n)
] ]
sample.extend(
[
Chunk(
content=f"Sentence {i} from document {j + k}",
chunk_metadata=ChunkMetadata(
document_id=f"document-{j + k}",
chunk_id=f"document-{j}-chunk-{i}",
source=f"example source-{j + k}-{i}",
),
)
for j in range(k)
for i in range(n)
]
)
return sample return sample

View file

@ -0,0 +1,53 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.vector_io import Chunk, ChunkMetadata
from llama_stack.providers.utils.vector_io.chunk_utils import extract_or_generate_chunk_id, generate_chunk_id
# This test is a unit test for the chunk_utils.py helpers. This should only contain
# tests which are specific to this file. More general (API-level) tests should be placed in
# tests/integration/vector_io/
#
# How to run this test:
#
# pytest tests/unit/providers/vector_io/test_chunk_utils.py \
# -v -s --tb=short --disable-warnings --asyncio-mode=auto
def test_generate_chunk_id():
    """IDs are deterministic and differ when only the chunk text differs."""
    document_id = "doc-1"
    contents = ["test", "test ", "test 3"]
    chunks = [Chunk(content=text, metadata={"document_id": document_id}) for text in contents]

    chunk_ids = [generate_chunk_id(c.metadata["document_id"], c.content) for c in chunks]
    chunk_ids.sort()

    expected_ids = [
        "177a1368-f6a8-0c50-6e92-18677f2c3de3",
        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
        "f68df25d-d9aa-ab4d-5684-64a233add20d",
    ]
    assert chunk_ids == expected_ids
def test_extract_or_generate_chunk_id():
    """Cover the fallbacks of `extract_or_generate_chunk_id`: stored ID, derived ID, random ID."""
    # No chunk ID is stored, only a document_id in the metadata: the ID is derived
    # from document_id + content and pinned here to its known value.
    chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
    assert extract_or_generate_chunk_id(chunk_with_id) == "84ededcc-b80b-a83e-1a20-ca6515a11350"

    # Same path, checked against generate_chunk_id directly
    chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
    assert extract_or_generate_chunk_id(chunk_with_doc_id) == generate_chunk_id("doc-1", "test")

    # A chunk_id stored in ChunkMetadata takes precedence over the ID derived from metadata["document_id"]
    chunk_with_metadata = Chunk(
        content="test", metadata={"document_id": "existing-id"}, chunk_metadata=ChunkMetadata(chunk_id="chunk-id-1")
    )
    assert extract_or_generate_chunk_id(chunk_with_metadata) == "chunk-id-1"

    # Neither a chunk ID nor a document ID is available: a random ID is generated
    chunk_without_id = Chunk(content="test")
    generated_id = extract_or_generate_chunk_id(chunk_without_id)
    assert isinstance(generated_id, str) and len(generated_id) == 36  # Length of a canonical UUID string

View file

@ -81,7 +81,7 @@ __QUERY = "Sample query"
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 30)]) @pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 60)])
async def test_qdrant_adapter_returns_expected_chunks( async def test_qdrant_adapter_returns_expected_chunks(
qdrant_adapter: QdrantVectorIOAdapter, qdrant_adapter: QdrantVectorIOAdapter,
vector_db_id, vector_db_id,

View file

@ -15,7 +15,6 @@ from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import (
SQLiteVecIndex, SQLiteVecIndex,
SQLiteVecVectorIOAdapter, SQLiteVecVectorIOAdapter,
_create_sqlite_connection, _create_sqlite_connection,
generate_chunk_id,
) )
# This test is a unit test for the SQLiteVecVectorIOAdapter class. This should only contain # This test is a unit test for the SQLiteVecVectorIOAdapter class. This should only contain
@ -150,21 +149,6 @@ async def sqlite_vec_adapter(sqlite_connection):
await adapter.shutdown() await adapter.shutdown()
def test_generate_chunk_id():
chunks = [
Chunk(content="test", metadata={"document_id": "doc-1"}),
Chunk(content="test ", metadata={"document_id": "doc-1"}),
Chunk(content="test 3", metadata={"document_id": "doc-1"}),
]
chunk_ids = sorted([generate_chunk_id(chunk.metadata["document_id"], chunk.content) for chunk in chunks])
assert chunk_ids == [
"177a1368-f6a8-0c50-6e92-18677f2c3de3",
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
"f68df25d-d9aa-ab4d-5684-64a233add20d",
]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings): async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test hybrid search when keyword search returns no matches - should still return vector results.""" """Test hybrid search when keyword search returns no matches - should still return vector results."""
@ -339,7 +323,7 @@ async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks
# Verify scores are in descending order # Verify scores are in descending order
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
# Verify we get results from both the vector-similar document and keyword-matched document # Verify we get results from both the vector-similar document and keyword-matched document
doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks} doc_ids = {chunk.metadata.get("document_id") or chunk.chunk_metadata.document_id for chunk in response.chunks}
assert "document-0" in doc_ids # From vector search assert "document-0" in doc_ids # From vector search
assert "document-2" in doc_ids # From keyword search assert "document-2" in doc_ids # From keyword search
@ -364,7 +348,11 @@ async def test_query_chunks_hybrid_weighted_reranker_parametrization(
reranker_params={"alpha": 1.0}, reranker_params={"alpha": 1.0},
) )
assert len(response.chunks) > 0 # Should get at least one result assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) assert any(
"document-0"
in (chunk.metadata.get("document_id") or (chunk.chunk_metadata.document_id if chunk.chunk_metadata else ""))
for chunk in response.chunks
)
# alpha=0.0 (should behave like pure vector) # alpha=0.0 (should behave like pure vector)
response = await sqlite_vec_index.query_hybrid( response = await sqlite_vec_index.query_hybrid(
@ -389,7 +377,11 @@ async def test_query_chunks_hybrid_weighted_reranker_parametrization(
reranker_params={"alpha": 0.7}, reranker_params={"alpha": 0.7},
) )
assert len(response.chunks) > 0 # Should get at least one result assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) assert any(
"document-0"
in (chunk.metadata.get("document_id") or (chunk.chunk_metadata.document_id if chunk.chunk_metadata else ""))
for chunk in response.chunks
)
@pytest.mark.asyncio @pytest.mark.asyncio