From f4a54b9db4d73c4e1481c81e28436e0b3ff0c5ae Mon Sep 17 00:00:00 2001
From: r-bit-rry
Date: Mon, 17 Nov 2025 11:03:20 +0200
Subject: [PATCH] fix(3797): sanitize metadata for attributes to avoid silent failure

---
 .../utils/memory/openai_vector_store_mixin.py |  5 +++-
 .../providers/utils/vector_io/vector_utils.py | 23 +++++++++++++++
 .../providers/vector_io/test_vector_utils.py  | 29 ++++++++++++++++++-
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
index 540ff5940..24b535721 100644
--- a/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@@ -23,6 +23,9 @@ from llama_stack.providers.utils.memory.vector_store import (
     content_from_data_and_mime_type,
     make_overlapped_chunks,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import (
+    sanitize_metadata_for_attributes,
+)
 from llama_stack_api import (
     Chunk,
     Files,
@@ -635,7 +638,7 @@ class OpenAIVectorStoreMixin(ABC):
                     file_id=chunk.metadata.get("document_id", ""),
                     filename=chunk.metadata.get("filename", ""),
                     score=score,
-                    attributes=chunk.metadata,
+                    attributes=sanitize_metadata_for_attributes(chunk.metadata),
                     content=content,
                 )
                 data.append(response_data_item)
diff --git a/src/llama_stack/providers/utils/vector_io/vector_utils.py b/src/llama_stack/providers/utils/vector_io/vector_utils.py
index 324f35405..2ee4948f7 100644
--- a/src/llama_stack/providers/utils/vector_io/vector_utils.py
+++ b/src/llama_stack/providers/utils/vector_io/vector_utils.py
@@ -7,6 +7,7 @@
 import hashlib
 import re
 import uuid
+from typing import Any
 
 
 def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
@@ -37,6 +38,28 @@ def sanitize_collection_name(name: str, weaviate_format=False) -> str:
     return s
 
 
+def sanitize_metadata_for_attributes(metadata: dict[str, Any]) -> dict[str, str | float | bool]:
+    """
+    Filter metadata to primitives for VectorStoreSearchResponse.attributes compatibility.
+
+    Converts dict[str, Any] to dict[str, str | float | bool]:
+    - Preserves: str, bool
+    - Converts: int/float -> float, list -> comma-separated string
+    - Filters: dict, None, other types
+    """
+    sanitized: dict[str, str | float | bool] = {}
+    for key, value in metadata.items():
+        if isinstance(value, bool):
+            sanitized[key] = value
+        elif isinstance(value, (int, float)):
+            sanitized[key] = float(value)
+        elif isinstance(value, str):
+            sanitized[key] = value
+        elif isinstance(value, list):
+            sanitized[key] = ", ".join(str(item) for item in value)
+    return sanitized
+
+
 class WeightedInMemoryAggregator:
     @staticmethod
     def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
diff --git a/tests/unit/providers/vector_io/test_vector_utils.py b/tests/unit/providers/vector_io/test_vector_utils.py
index 7f6b4af79..becf32575 100644
--- a/tests/unit/providers/vector_io/test_vector_utils.py
+++ b/tests/unit/providers/vector_io/test_vector_utils.py
@@ -4,7 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+from llama_stack.providers.utils.vector_io.vector_utils import (
+    generate_chunk_id,
+    sanitize_metadata_for_attributes,
+)
 from llama_stack_api import Chunk, ChunkMetadata
 
 # This test is a unit test for the chunk_utils.py helpers. This should only contain
@@ -78,3 +81,27 @@ def test_chunk_serialization():
     serialized_chunk = chunk.model_dump()
     assert serialized_chunk["chunk_id"] == "test-chunk-id"
     assert "chunk_id" in serialized_chunk
+
+
+def test_sanitize_metadata_for_attributes():
+    """Test sanitization of metadata for VectorStoreSearchResponse.attributes."""
+    # metadata with lists should be converted to strings
+    metadata = {
+        "tags": ["transformers", "h100-compatible", "region:us"],
+        "model_name": "granite-3.3-8b",
+        "score": 0.95,
+        "active": True,
+        "count": 42,
+        "nested": {"key": "value"},  # Should be filtered out
+    }
+    result = sanitize_metadata_for_attributes(metadata)
+
+    # Lists converted to comma-separated strings
+    assert result["tags"] == "transformers, h100-compatible, region:us"
+    # Primitives preserved
+    assert result["model_name"] == "granite-3.3-8b"
+    assert result["score"] == 0.95
+    assert result["active"] is True
+    assert result["count"] == 42.0  # int -> float
+    # Complex types filtered out
+    assert "nested" not in result