From f77d4d91f56dd876f4041679ce62d270988407bb Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Fri, 4 Jul 2025 11:10:18 +0100 Subject: [PATCH] fix: handle encoding errors when adding files to vector store (#2574) - Add try/except block around data.decode() to handle UnicodeDecodeError - Implement UTF-8 fallback when detected encoding fails - Re-raise the original UnicodeDecodeError when both encodings fail - Add unit tests Fixes #2572: UnicodeDecodeError when uploading files with problematic encodings Signed-off-by: Derek Higgins --- .../providers/utils/memory/vector_store.py | 15 ++++++- .../utils/memory/test_vector_store.py | 44 ++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index 7a83a9826..f892d33c6 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en mime_category = mime_type.split("/")[0] if mime_type else None if mime_category == "text": # For text-based files (including CSV, MD) - return data.decode(encoding) + encodings_to_try = [encoding] + if encoding != "utf-8": + encodings_to_try.append("utf-8") + first_exception = None + for encoding in encodings_to_try: + try: + return data.decode(encoding) + except UnicodeDecodeError as e: + if first_exception is None: + first_exception = e + log.warning(f"Decoding failed with {encoding}: {e}") + # raise the original exception, if we got here there was at least 1 exception + log.error(f"Could not decode data as any of {encodings_to_try}") + raise first_exception elif mime_type == "application/pdf": return parse_pdf(data) diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py index 4a3c33a6b..220c21994 100644 ---
a/tests/unit/providers/utils/memory/test_vector_store.py +++ b/tests/unit/providers/utils/memory/test_vector_store.py @@ -10,7 +10,7 @@ import pytest from llama_stack.apis.common.content_types import URL, TextContentItem from llama_stack.apis.tools import RAGDocument -from llama_stack.providers.utils.memory.vector_store import content_from_doc +from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc @pytest.mark.asyncio @@ -143,3 +143,45 @@ async def test_content_from_doc_with_interleaved_content(): assert result == "First item\nSecond item" mock_interleaved.assert_called_once_with(interleaved_content) + + +def test_content_from_data_and_mime_type_success_utf8(): + """Test successful decoding with UTF-8 encoding.""" + data = "Hello World! 🌍".encode() + mime_type = "text/plain" + + with patch("chardet.detect") as mock_detect: + mock_detect.return_value = {"encoding": "utf-8"} + + result = content_from_data_and_mime_type(data, mime_type) + + mock_detect.assert_called_once_with(data) + assert result == "Hello World! 🌍" + + +def test_content_from_data_and_mime_type_error_win1252(): + """Test fallback to UTF-8 when Windows-1252 encoding detection fails.""" + data = "Hello World! 🌍".encode() + mime_type = "text/plain" + + with patch("chardet.detect") as mock_detect: + mock_detect.return_value = {"encoding": "Windows-1252"} + + result = content_from_data_and_mime_type(data, mime_type) + + assert result == "Hello World! 
🌍" + mock_detect.assert_called_once_with(data) + + +def test_content_from_data_and_mime_type_both_encodings_fail(): + """Test that exceptions are raised when both primary and UTF-8 encodings fail.""" + # Create invalid byte sequence that fails with both encodings + data = b"\xff\xfe\x00\x8f" # Invalid UTF-8 sequence + mime_type = "text/plain" + + with patch("chardet.detect") as mock_detect: + mock_detect.return_value = {"encoding": "windows-1252"} + + # Should raise an exception instead of returning empty string + with pytest.raises(UnicodeDecodeError): + content_from_data_and_mime_type(data, mime_type)