mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-20 20:32:26 +00:00
# What does this PR do?
Adding `ChunkMetadata` so we can properly delete embeddings later.
More specifically, this PR refactors and extends the chunk metadata
handling in the vector database and introduces a distinction between
metadata used for model context and backend-only metadata required for
chunk management, storage, and retrieval. It also improves chunk ID
generation and propagation throughout the stack, enhances test coverage,
and adds new utility modules.
```python
class ChunkMetadata(BaseModel):
"""
`ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
will NOT be inserted into the context during inference, but is required for backend functionality.
Use `metadata` in `Chunk` for metadata that will be used during inference.
"""
document_id: str | None = None
chunk_id: str | None = None
source: str | None = None
created_timestamp: int | None = None
updated_timestamp: int | None = None
chunk_window: str | None = None
chunk_tokenizer: str | None = None
chunk_embedding_model: str | None = None
chunk_embedding_dimension: int | None = None
content_token_count: int | None = None
metadata_token_count: int | None = None
```
Eventually we can migrate `document_id` out of the `metadata` field.
I've introduced the changes so that `ChunkMetadata` is backward-compatible
with `metadata`.
<!-- If resolving an issue, uncomment and update the line below -->
Closes https://github.com/meta-llama/llama-stack/issues/2501
## Test Plan
Added unit tests
---------
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
56 lines
1.5 KiB
Python
56 lines
1.5 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
import random
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from llama_stack.apis.vector_io import Chunk, ChunkMetadata
|
|
|
|
# Length of each synthetic embedding vector generated by the fixtures in this file.
EMBEDDING_DIMENSION = 384
|
|
|
|
|
|
@pytest.fixture
def vector_db_id() -> str:
    """Return a fresh vector DB identifier for every test that requests it.

    The suffix is 64 random bits rendered as 16 hex digits. The previous
    ``random.randint(1, 100)`` suffix had only 100 possible values, so two
    tests in the same run could easily be handed the same identifier.

    Returns:
        A string of the form ``test-vector-db-<16 hex digits>``.
    """
    return f"test-vector-db-{random.getrandbits(64):016x}"
|
|
|
|
|
|
@pytest.fixture(scope="session")
def embedding_dimension() -> int:
    """Expose the module-level ``EMBEDDING_DIMENSION`` constant as a session-scoped fixture."""
    dimension = EMBEDDING_DIMENSION
    return dimension
|
|
|
|
|
|
@pytest.fixture(scope="session")
def sample_chunks():
    """Build the session-wide list of sample chunks: ten chunks for each of six documents.

    Per the original author, giving every document several chunks is meant to
    force multiple batches per document so that chunk ID conflicts surface.

    Documents 0-2 carry their document ID in the ``metadata`` dict; documents
    3-5 carry it in a ``ChunkMetadata`` object together with an explicit
    chunk ID and source string.
    """
    chunks_per_doc, doc_count = 10, 3
    chunks = []

    # First group: document ID supplied through the plain ``metadata`` dict.
    for doc in range(doc_count):
        for idx in range(chunks_per_doc):
            chunks.append(
                Chunk(
                    content=f"Sentence {idx} from document {doc}",
                    metadata={"document_id": f"document-{doc}"},
                )
            )

    # Second group: document numbers continue after the first group, and the
    # identifying fields live in ``ChunkMetadata`` instead of ``metadata``.
    for doc in range(doc_count):
        shifted = doc + doc_count
        for idx in range(chunks_per_doc):
            backend_meta = ChunkMetadata(
                document_id=f"document-{shifted}",
                # NOTE(review): chunk_id uses the unshifted document index while
                # document_id uses the shifted one -- confirm this is intentional.
                chunk_id=f"document-{doc}-chunk-{idx}",
                source=f"example source-{shifted}-{idx}",
            )
            chunks.append(
                Chunk(
                    content=f"Sentence {idx} from document {shifted}",
                    chunk_metadata=backend_meta,
                )
            )

    return chunks
|
|
|
|
|
|
@pytest.fixture(scope="session")
def sample_embeddings(sample_chunks):
    """Return one deterministic pseudo-random embedding per sample chunk.

    Args:
        sample_chunks: The chunks to produce embeddings for; only their count is used.

    Returns:
        A ``float32`` NumPy array of shape ``(len(sample_chunks), EMBEDDING_DIMENSION)``.
    """
    # A private RandomState seeded with 42 draws the same legacy stream as
    # ``np.random.seed(42)`` followed by ``np.random.rand``, but it does not
    # reseed NumPy's process-wide RNG as a hidden side effect of this fixture.
    rng = np.random.RandomState(42)
    # One 2-D draw fills rows in order, matching successive per-chunk draws.
    return rng.rand(len(sample_chunks), EMBEDDING_DIMENSION).astype(np.float32)
|