diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index 1bc00d1d3..e0e9d0679 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -144,7 +144,12 @@ def make_overlapped_chunks(
 ) -> list[Chunk]:
     tokenizer = Tokenizer.get_instance()
     tokens = tokenizer.encode(text, bos=False, eos=False)
-    metadata_tokens = tokenizer.encode(str(metadata), bos=False, eos=False)
+    try:
+        metadata_string = str(metadata)
+    except Exception as e:
+        raise ValueError("Failed to serialize metadata to string") from e
+
+    metadata_tokens = tokenizer.encode(metadata_string, bos=False, eos=False)
 
     chunks = []
     for i in range(0, len(tokens), window_len - overlap_len):
diff --git a/tests/unit/rag/test_vector_store.py b/tests/unit/rag/test_vector_store.py
index 58cb3cc4b..611ede9d4 100644
--- a/tests/unit/rag/test_vector_store.py
+++ b/tests/unit/rag/test_vector_store.py
@@ -108,3 +108,22 @@ class TestVectorStore:
         assert isinstance(chunk.metadata["token_count"], int)
         assert chunk.metadata["token_count"] > 0
         assert chunk.metadata["metadata_token_count"] == len_metadata_tokens
+
+    def test_raise_overlapped_chunks_metadata_serialization_error(self):
+        document_id = "test_doc_ex"
+        text = "Some text"
+        window_len = 5
+        overlap_len = 2
+
+        class BadMetadata:
+            def __repr__(self):
+                raise TypeError("Cannot convert to string")
+
+        problematic_metadata = {"bad_metadata_example": BadMetadata()}
+
+        with pytest.raises(ValueError) as excinfo:
+            make_overlapped_chunks(document_id, text, window_len, overlap_len, problematic_metadata)
+
+        assert str(excinfo.value) == "Failed to serialize metadata to string"
+        assert isinstance(excinfo.value.__cause__, TypeError)
+        assert str(excinfo.value.__cause__) == "Cannot convert to string"