mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-26 03:02:00 +00:00
fix: handle encoding errors when adding files to vector store
- Add try/except block around data.decode() to handle UnicodeDecodeError - Implement UTF-8 fallback when decoding with the detected encoding fails - Re-raise the original exception if the fallback fails - Add unit tests Fixes #2572: UnicodeDecodeError when uploading files with problematic encodings Signed-off-by: Derek Higgins <derekh@redhat.com>
This commit is contained in:
parent
b246b0660e
commit
87b8530e3f
2 changed files with 57 additions and 2 deletions
|
|
@ -10,7 +10,7 @@ import pytest
|
|||
|
||||
from llama_stack.apis.common.content_types import URL, TextContentItem
|
||||
from llama_stack.apis.tools import RAGDocument
|
||||
from llama_stack.providers.utils.memory.vector_store import content_from_doc
|
||||
from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
@ -143,3 +143,45 @@ async def test_content_from_doc_with_interleaved_content():
|
|||
|
||||
assert result == "First item\nSecond item"
|
||||
mock_interleaved.assert_called_once_with(interleaved_content)
|
||||
|
||||
|
||||
def test_content_from_data_and_mime_type_success_utf8():
    """UTF-8 bytes decode to the original text when chardet reports UTF-8."""
    expected_text = "Hello World! 🌍"
    payload = expected_text.encode()

    # Pin the detector's answer so the test does not depend on chardet heuristics.
    with patch("chardet.detect", return_value={"encoding": "utf-8"}) as detect_mock:
        decoded = content_from_data_and_mime_type(payload, "text/plain")

    detect_mock.assert_called_once_with(payload)
    assert decoded == expected_text
|
||||
|
||||
|
||||
def test_content_from_data_and_mime_type_error_win1252():
    """UTF-8 data is still decoded when chardet reports Windows-1252.

    The detector is mocked to report Windows-1252 for bytes that are really
    UTF-8; the expected result is the original text, i.e. the UTF-8 fallback
    must kick in.
    """
    expected_text = "Hello World! 🌍"
    payload = expected_text.encode()

    # Force a wrong detection result for the UTF-8 payload.
    with patch("chardet.detect", return_value={"encoding": "Windows-1252"}) as detect_mock:
        decoded = content_from_data_and_mime_type(payload, "text/plain")

    assert decoded == expected_text
    detect_mock.assert_called_once_with(payload)
|
||||
|
||||
|
||||
def test_content_from_data_and_mime_type_both_encodings_fail():
    """A UnicodeDecodeError propagates when neither the detected encoding nor UTF-8 works."""
    # Byte sequence chosen to be undecodable with both encodings.
    undecodable = b"\xff\xfe\x00\x8f"

    # The error must surface to the caller rather than being swallowed
    # (e.g. by returning an empty string).
    with (
        patch("chardet.detect", return_value={"encoding": "windows-1252"}),
        pytest.raises(UnicodeDecodeError),
    ):
        content_from_data_and_mime_type(undecodable, "text/plain")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue