From f77d4d91f56dd876f4041679ce62d270988407bb Mon Sep 17 00:00:00 2001
From: Derek Higgins <derekh@redhat.com>
Date: Fri, 4 Jul 2025 11:10:18 +0100
Subject: [PATCH] fix: handle encoding errors when adding files to vector store
 (#2574)

- Add try-catch block around data.decode() to handle UnicodeDecodeError
- Implement UTF-8 fallback when detected encoding fails
- Re-raise the original UnicodeDecodeError when both encodings fail
- Add unit tests

Fixes #2572: UnicodeDecodeError when uploading files with problematic
encodings

Signed-off-by: Derek Higgins <derekh@redhat.com>
---
 .../providers/utils/memory/vector_store.py    | 15 ++++++-
 .../utils/memory/test_vector_store.py         | 44 ++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index 7a83a9826..f892d33c6 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
     mime_category = mime_type.split("/")[0] if mime_type else None
     if mime_category == "text":
         # For text-based files (including CSV, MD)
-        return data.decode(encoding)
+        encodings_to_try = [encoding]
+        if encoding != "utf-8":
+            encodings_to_try.append("utf-8")
+        first_exception = None
+        for encoding in encodings_to_try:
+            try:
+                return data.decode(encoding)
+            except UnicodeDecodeError as e:
+                if first_exception is None:
+                    first_exception = e
+                log.warning(f"Decoding failed with {encoding}: {e}")
+        # raise the original exception; if we got here there was at least 1 exception
+        log.error(f"Could not decode data as any of {encodings_to_try}")
+        raise first_exception
 
     elif mime_type == "application/pdf":
         return parse_pdf(data)
diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py
index 4a3c33a6b..220c21994 100644
--- a/tests/unit/providers/utils/memory/test_vector_store.py
+++ b/tests/unit/providers/utils/memory/test_vector_store.py
@@ -10,7 +10,7 @@ import pytest
 
 from llama_stack.apis.common.content_types import URL, TextContentItem
 from llama_stack.apis.tools import RAGDocument
-from llama_stack.providers.utils.memory.vector_store import content_from_doc
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc
 
 
 @pytest.mark.asyncio
@@ -143,3 +143,45 @@ async def test_content_from_doc_with_interleaved_content():
 
         assert result == "First item\nSecond item"
         mock_interleaved.assert_called_once_with(interleaved_content)
+
+
+def test_content_from_data_and_mime_type_success_utf8():
+    """Test successful decoding with UTF-8 encoding."""
+    data = "Hello World! 🌍".encode()
+    mime_type = "text/plain"
+
+    with patch("chardet.detect") as mock_detect:
+        mock_detect.return_value = {"encoding": "utf-8"}
+
+        result = content_from_data_and_mime_type(data, mime_type)
+
+        mock_detect.assert_called_once_with(data)
+        assert result == "Hello World! 🌍"
+
+
+def test_content_from_data_and_mime_type_error_win1252():
+    """Test fallback to UTF-8 when decoding with the detected Windows-1252 encoding fails."""
+    data = "Hello World! 🌍".encode()
+    mime_type = "text/plain"
+
+    with patch("chardet.detect") as mock_detect:
+        mock_detect.return_value = {"encoding": "Windows-1252"}
+
+        result = content_from_data_and_mime_type(data, mime_type)
+
+        assert result == "Hello World! 🌍"
+        mock_detect.assert_called_once_with(data)
+
+
+def test_content_from_data_and_mime_type_both_encodings_fail():
+    """Test that exceptions are raised when both primary and UTF-8 encodings fail."""
+    # Create invalid byte sequence that fails with both encodings
+    data = b"\xff\xfe\x00\x8f"  # 0xff is invalid in UTF-8; 0x8f is undefined in Windows-1252
+    mime_type = "text/plain"
+
+    with patch("chardet.detect") as mock_detect:
+        mock_detect.return_value = {"encoding": "windows-1252"}
+
+        # Should raise an exception instead of returning empty string
+        with pytest.raises(UnicodeDecodeError):
+            content_from_data_and_mime_type(data, mime_type)