mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-03 18:00:36 +00:00
fix(3797): sanitize metadata for attributes to avoid silent failure
This commit is contained in:
parent
97f535c4f1
commit
f4a54b9db4
3 changed files with 55 additions and 2 deletions
|
|
@ -23,6 +23,9 @@ from llama_stack.providers.utils.memory.vector_store import (
|
||||||
content_from_data_and_mime_type,
|
content_from_data_and_mime_type,
|
||||||
make_overlapped_chunks,
|
make_overlapped_chunks,
|
||||||
)
|
)
|
||||||
|
from llama_stack.providers.utils.vector_io.vector_utils import (
|
||||||
|
sanitize_metadata_for_attributes,
|
||||||
|
)
|
||||||
from llama_stack_api import (
|
from llama_stack_api import (
|
||||||
Chunk,
|
Chunk,
|
||||||
Files,
|
Files,
|
||||||
|
|
@ -635,7 +638,7 @@ class OpenAIVectorStoreMixin(ABC):
|
||||||
file_id=chunk.metadata.get("document_id", ""),
|
file_id=chunk.metadata.get("document_id", ""),
|
||||||
filename=chunk.metadata.get("filename", ""),
|
filename=chunk.metadata.get("filename", ""),
|
||||||
score=score,
|
score=score,
|
||||||
attributes=chunk.metadata,
|
attributes=sanitize_metadata_for_attributes(chunk.metadata),
|
||||||
content=content,
|
content=content,
|
||||||
)
|
)
|
||||||
data.append(response_data_item)
|
data.append(response_data_item)
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@
|
||||||
import hashlib
|
import hashlib
|
||||||
import re
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
|
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
|
||||||
|
|
@ -37,6 +38,28 @@ def sanitize_collection_name(name: str, weaviate_format=False) -> str:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_metadata_for_attributes(metadata: dict[str, Any]) -> dict[str, str | float | bool]:
|
||||||
|
"""
|
||||||
|
Filter metadata to primitives for VectorStoreSearchResponse.attributes compatibility.
|
||||||
|
|
||||||
|
Converts dict[str, Any] to dict[str, str | float | bool]:
|
||||||
|
- Preserves: str, bool
|
||||||
|
- Converts: int/float -> float, list -> comma-separated string
|
||||||
|
- Filters: dict, None, other types
|
||||||
|
"""
|
||||||
|
sanitized: dict[str, str | float | bool] = {}
|
||||||
|
for key, value in metadata.items():
|
||||||
|
if isinstance(value, bool):
|
||||||
|
sanitized[key] = value
|
||||||
|
elif isinstance(value, (int, float)):
|
||||||
|
sanitized[key] = float(value)
|
||||||
|
elif isinstance(value, str):
|
||||||
|
sanitized[key] = value
|
||||||
|
elif isinstance(value, list):
|
||||||
|
sanitized[key] = ", ".join(str(item) for item in value)
|
||||||
|
return sanitized
|
||||||
|
|
||||||
|
|
||||||
class WeightedInMemoryAggregator:
|
class WeightedInMemoryAggregator:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
|
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,10 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
|
from llama_stack.providers.utils.vector_io.vector_utils import (
|
||||||
|
generate_chunk_id,
|
||||||
|
sanitize_metadata_for_attributes,
|
||||||
|
)
|
||||||
from llama_stack_api import Chunk, ChunkMetadata
|
from llama_stack_api import Chunk, ChunkMetadata
|
||||||
|
|
||||||
# This test is a unit test for the chunk_utils.py helpers. This should only contain
|
# This test is a unit test for the chunk_utils.py helpers. This should only contain
|
||||||
|
|
@ -78,3 +81,27 @@ def test_chunk_serialization():
|
||||||
serialized_chunk = chunk.model_dump()
|
serialized_chunk = chunk.model_dump()
|
||||||
assert serialized_chunk["chunk_id"] == "test-chunk-id"
|
assert serialized_chunk["chunk_id"] == "test-chunk-id"
|
||||||
assert "chunk_id" in serialized_chunk
|
assert "chunk_id" in serialized_chunk
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_metadata_for_attributes():
    """Check that metadata is reduced to the primitives accepted by VectorStoreSearchResponse.attributes."""
    # One entry per kind of value the sanitizer has to deal with.
    raw_metadata = {
        "tags": ["transformers", "h100-compatible", "region:us"],
        "model_name": "granite-3.3-8b",
        "score": 0.95,
        "active": True,
        "count": 42,
        "nested": {"key": "value"},  # expected to be dropped
    }

    sanitized = sanitize_metadata_for_attributes(raw_metadata)

    # A list is flattened into one comma-separated string.
    assert sanitized["tags"] == "transformers, h100-compatible, region:us"

    # Strings and numbers come through by value; the int is compared as a float.
    expected_by_value = {
        "model_name": "granite-3.3-8b",
        "score": 0.95,
        "count": 42.0,
    }
    for attribute, expected in expected_by_value.items():
        assert sanitized[attribute] == expected

    # The bool must stay a real bool rather than being coerced to a number.
    assert sanitized["active"] is True

    # Nested mappings are not representable as attributes and are removed.
    assert "nested" not in sanitized
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue