mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-25 12:38:03 +00:00
fix: handle encoding errors when adding files to vector store
- Add a try/except block around data.decode() to handle UnicodeDecodeError - Implement a UTF-8 fallback when the detected encoding fails - Re-raise the original exception if the fallback also fails - Add unit tests Fixes #2572: UnicodeDecodeError when uploading files with problematic encodings Signed-off-by: Derek Higgins <derekh@redhat.com>
This commit is contained in:
parent
b246b0660e
commit
87b8530e3f
2 changed files with 57 additions and 2 deletions
|
|
@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
|
|||
mime_category = mime_type.split("/")[0] if mime_type else None
|
||||
if mime_category == "text":
|
||||
# For text-based files (including CSV, MD)
|
||||
return data.decode(encoding)
|
||||
encodings_to_try = [encoding]
|
||||
if encoding != "utf-8":
|
||||
encodings_to_try.append("utf-8")
|
||||
first_exception = None
|
||||
for encoding in encodings_to_try:
|
||||
try:
|
||||
return data.decode(encoding)
|
||||
except UnicodeDecodeError as e:
|
||||
if first_exception is None:
|
||||
first_exception = e
|
||||
log.warning(f"Decoding failed with {encoding}: {e}")
|
||||
# Raise the original exception; if we got here, there was at least one exception.
|
||||
log.error(f"Could not decode data as any of {encodings_to_try}")
|
||||
raise first_exception
|
||||
|
||||
elif mime_type == "application/pdf":
|
||||
return parse_pdf(data)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue