mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-07 06:20:45 +00:00
fix: handle encoding errors when adding files to vector store (#2574)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 5s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 5s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 7s
Integration Tests / test-matrix (server, 3.12, datasets) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.12, agents) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.12, inference) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.12, inspect) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.12, post_training) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.12, providers) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.12, scoring) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.12, tool_runtime) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.13, agents) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.12, vector_io) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.13, datasets) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.13, inference) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.13, inspect) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.13, post_training) (push) Failing after 7s
Integration Tests / test-matrix (server, 3.13, scoring) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, providers) (push) Failing after 9s
Integration Tests / test-matrix (server, 3.13, vector_io) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, tool_runtime) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 10s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 7s
Test Llama Stack Build / generate-matrix (push) Successful in 5s
Python Package Build Test / build (3.13) (push) Failing after 1s
Python Package Build Test / build (3.12) (push) Failing after 1s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Test External Providers / test-external-providers (venv) (push) Failing after 6s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 6s
Test Llama Stack Build / build (push) Failing after 5s
Unit Tests / unit-tests (3.12) (push) Failing after 7s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 45s
Test Llama Stack Build / build-single-provider (push) Failing after 37s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 33s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 43s
Pre-commit / pre-commit (push) Successful in 1m35s
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 5s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 5s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 7s
Integration Tests / test-matrix (server, 3.12, datasets) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.12, agents) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.12, inference) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.12, inspect) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.12, post_training) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.12, providers) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.12, scoring) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.12, tool_runtime) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.13, agents) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.12, vector_io) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.13, datasets) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.13, inference) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.13, inspect) (push) Failing after 8s
Integration Tests / test-matrix (server, 3.13, post_training) (push) Failing after 7s
Integration Tests / test-matrix (server, 3.13, scoring) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, providers) (push) Failing after 9s
Integration Tests / test-matrix (server, 3.13, vector_io) (push) Failing after 6s
Integration Tests / test-matrix (server, 3.13, tool_runtime) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 10s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 7s
Test Llama Stack Build / generate-matrix (push) Successful in 5s
Python Package Build Test / build (3.13) (push) Failing after 1s
Python Package Build Test / build (3.12) (push) Failing after 1s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Test External Providers / test-external-providers (venv) (push) Failing after 6s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 6s
Test Llama Stack Build / build (push) Failing after 5s
Unit Tests / unit-tests (3.12) (push) Failing after 7s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 45s
Test Llama Stack Build / build-single-provider (push) Failing after 37s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 33s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 43s
Pre-commit / pre-commit (push) Successful in 1m35s
- Add try-catch block around data.decode() to handle UnicodeDecodeError - Implement UTF-8 fallback when detected encoding fails - Return empty string when both encodings fail - add unit tests Fixes #2572: UnicodeDecodeError when uploading files with problematic encodings Signed-off-by: Derek Higgins <derekh@redhat.com>
This commit is contained in:
parent
f1c62e0af0
commit
f77d4d91f5
2 changed files with 57 additions and 2 deletions
|
@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
|
||||||
mime_category = mime_type.split("/")[0] if mime_type else None
|
mime_category = mime_type.split("/")[0] if mime_type else None
|
||||||
if mime_category == "text":
|
if mime_category == "text":
|
||||||
# For text-based files (including CSV, MD)
|
# For text-based files (including CSV, MD)
|
||||||
return data.decode(encoding)
|
encodings_to_try = [encoding]
|
||||||
|
if encoding != "utf-8":
|
||||||
|
encodings_to_try.append("utf-8")
|
||||||
|
first_exception = None
|
||||||
|
for encoding in encodings_to_try:
|
||||||
|
try:
|
||||||
|
return data.decode(encoding)
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
if first_exception is None:
|
||||||
|
first_exception = e
|
||||||
|
log.warning(f"Decoding failed with {encoding}: {e}")
|
||||||
|
# raise the origional exception, if we got here there was at least 1 exception
|
||||||
|
log.error(f"Could not decode data as any of {encodings_to_try}")
|
||||||
|
raise first_exception
|
||||||
|
|
||||||
elif mime_type == "application/pdf":
|
elif mime_type == "application/pdf":
|
||||||
return parse_pdf(data)
|
return parse_pdf(data)
|
||||||
|
|
|
@ -10,7 +10,7 @@ import pytest
|
||||||
|
|
||||||
from llama_stack.apis.common.content_types import URL, TextContentItem
|
from llama_stack.apis.common.content_types import URL, TextContentItem
|
||||||
from llama_stack.apis.tools import RAGDocument
|
from llama_stack.apis.tools import RAGDocument
|
||||||
from llama_stack.providers.utils.memory.vector_store import content_from_doc
|
from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
@ -143,3 +143,45 @@ async def test_content_from_doc_with_interleaved_content():
|
||||||
|
|
||||||
assert result == "First item\nSecond item"
|
assert result == "First item\nSecond item"
|
||||||
mock_interleaved.assert_called_once_with(interleaved_content)
|
mock_interleaved.assert_called_once_with(interleaved_content)
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_from_data_and_mime_type_success_utf8():
|
||||||
|
"""Test successful decoding with UTF-8 encoding."""
|
||||||
|
data = "Hello World! 🌍".encode()
|
||||||
|
mime_type = "text/plain"
|
||||||
|
|
||||||
|
with patch("chardet.detect") as mock_detect:
|
||||||
|
mock_detect.return_value = {"encoding": "utf-8"}
|
||||||
|
|
||||||
|
result = content_from_data_and_mime_type(data, mime_type)
|
||||||
|
|
||||||
|
mock_detect.assert_called_once_with(data)
|
||||||
|
assert result == "Hello World! 🌍"
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_from_data_and_mime_type_error_win1252():
|
||||||
|
"""Test fallback to UTF-8 when Windows-1252 encoding detection fails."""
|
||||||
|
data = "Hello World! 🌍".encode()
|
||||||
|
mime_type = "text/plain"
|
||||||
|
|
||||||
|
with patch("chardet.detect") as mock_detect:
|
||||||
|
mock_detect.return_value = {"encoding": "Windows-1252"}
|
||||||
|
|
||||||
|
result = content_from_data_and_mime_type(data, mime_type)
|
||||||
|
|
||||||
|
assert result == "Hello World! 🌍"
|
||||||
|
mock_detect.assert_called_once_with(data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_from_data_and_mime_type_both_encodings_fail():
|
||||||
|
"""Test that exceptions are raised when both primary and UTF-8 encodings fail."""
|
||||||
|
# Create invalid byte sequence that fails with both encodings
|
||||||
|
data = b"\xff\xfe\x00\x8f" # Invalid UTF-8 sequence
|
||||||
|
mime_type = "text/plain"
|
||||||
|
|
||||||
|
with patch("chardet.detect") as mock_detect:
|
||||||
|
mock_detect.return_value = {"encoding": "windows-1252"}
|
||||||
|
|
||||||
|
# Should raise an exception instead of returning empty string
|
||||||
|
with pytest.raises(UnicodeDecodeError):
|
||||||
|
content_from_data_and_mime_type(data, mime_type)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue