fix: handle encoding errors when adding files to vector store

- Add try/except block around data.decode() to handle UnicodeDecodeError
- Implement UTF-8 fallback when detected encoding fails
- Re-raise original exception if fallback fails
- Add unit tests

Fixes #2572: UnicodeDecodeError when uploading files with problematic encodings

Signed-off-by: Derek Higgins <derekh@redhat.com>
This commit is contained in:
Derek Higgins 2025-07-01 14:45:36 +01:00
parent b246b0660e
commit 87b8530e3f
2 changed files with 57 additions and 2 deletions

View file

@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
mime_category = mime_type.split("/")[0] if mime_type else None
if mime_category == "text":
# For text-based files (including CSV, MD)
encodings_to_try = [encoding]
if encoding != "utf-8":
encodings_to_try.append("utf-8")
first_exception = None
for encoding in encodings_to_try:
try:
return data.decode(encoding)
except UnicodeDecodeError as e:
if first_exception is None:
first_exception = e
log.warning(f"Decoding failed with {encoding}: {e}")
# Re-raise the original exception; reaching this point means at least one decode attempt failed
log.error(f"Could not decode data as any of {encodings_to_try}")
raise first_exception
elif mime_type == "application/pdf":
return parse_pdf(data)

View file

@ -10,7 +10,7 @@ import pytest
from llama_stack.apis.common.content_types import URL, TextContentItem
from llama_stack.apis.tools import RAGDocument
from llama_stack.providers.utils.memory.vector_store import content_from_doc
from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc
@pytest.mark.asyncio
@ -143,3 +143,45 @@ async def test_content_from_doc_with_interleaved_content():
assert result == "First item\nSecond item"
mock_interleaved.assert_called_once_with(interleaved_content)
def test_content_from_data_and_mime_type_success_utf8():
    """Text data decodes correctly when the detected encoding is UTF-8."""
    expected_text = "Hello World! 🌍"
    payload = expected_text.encode()

    # Stub chardet.detect so it reports UTF-8 for the payload.
    with patch("chardet.detect", return_value={"encoding": "utf-8"}) as mock_detect:
        decoded = content_from_data_and_mime_type(payload, "text/plain")

        # Detection must have been consulted exactly once, with the raw bytes.
        mock_detect.assert_called_once_with(payload)
        assert decoded == expected_text
def test_content_from_data_and_mime_type_error_win1252():
    """UTF-8 is used as a fallback when the detected Windows-1252 encoding cannot decode the data."""
    expected_text = "Hello World! 🌍"
    payload = expected_text.encode()

    # Stub chardet.detect so it reports Windows-1252 for data that is really UTF-8.
    with patch("chardet.detect", return_value={"encoding": "Windows-1252"}) as mock_detect:
        decoded = content_from_data_and_mime_type(payload, "text/plain")

        # The fallback decode must yield the original text.
        assert decoded == expected_text
        mock_detect.assert_called_once_with(payload)
def test_content_from_data_and_mime_type_both_encodings_fail():
    """A UnicodeDecodeError propagates when neither the detected encoding nor UTF-8 can decode the data."""
    # Byte sequence chosen so that decoding fails under both encodings tried.
    undecodable = b"\xff\xfe\x00\x8f"
    detected = {"encoding": "windows-1252"}

    # The error must surface to the caller rather than being swallowed
    # (e.g. by returning an empty string).
    with patch("chardet.detect", return_value=detected), pytest.raises(UnicodeDecodeError):
        content_from_data_and_mime_type(undecodable, "text/plain")