Merge branch 'main' into vectordb_name

2025-12-23 22:43:54 +00:00 · 2025-07-06 15:40:20 -04:00 · 2025-07-06 15:40:20 -04:00 · 74b0ab69ed
commit 74b0ab69ed
parent ddb29b306c 4bca4af3e4
161 changed files with 1844 additions and 11065 deletions
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@ -92,7 +92,20 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
    mime_category = mime_type.split("/")[0] if mime_type else None
    if mime_category == "text":
        # For text-based files (including CSV, MD)
-        return data.decode(encoding)
+        encodings_to_try = [encoding]
+        if encoding != "utf-8":
+            encodings_to_try.append("utf-8")
+        first_exception = None
+        for encoding in encodings_to_try:
+            try:
+                return data.decode(encoding)
+            except UnicodeDecodeError as e:
+                if first_exception is None:
+                    first_exception = e
+                log.warning(f"Decoding failed with {encoding}: {e}")
+        # raise the original exception; if we got here there was at least 1 exception
+        log.error(f"Could not decode data as any of {encodings_to_try}")
+        raise first_exception

    elif mime_type == "application/pdf":
        return parse_pdf(data)
@ -164,7 +177,8 @@ def make_overlapped_chunks(
    for i in range(0, len(tokens), window_len - overlap_len):
        toks = tokens[i : i + window_len]
        chunk = tokenizer.decode(toks)
-        chunk_id = generate_chunk_id(chunk, text)
+        chunk_window = f"{i}-{i + len(toks)}"
+        chunk_id = generate_chunk_id(chunk, text, chunk_window)
        chunk_metadata = metadata.copy()
        chunk_metadata["chunk_id"] = chunk_id
        chunk_metadata["document_id"] = document_id
@ -177,7 +191,7 @@ def make_overlapped_chunks(
            source=metadata.get("source", None),
            created_timestamp=metadata.get("created_timestamp", int(time.time())),
            updated_timestamp=int(time.time()),
-            chunk_window=f"{i}-{i + len(toks)}",
+            chunk_window=chunk_window,
            chunk_tokenizer=default_tokenizer,
            chunk_embedding_model=None,  # This will be set in `VectorDBWithIndex.insert_chunks`
            content_token_count=len(toks),
--- a/llama_stack/providers/utils/vector_io/chunk_utils.py
+++ b/llama_stack/providers/utils/vector_io/chunk_utils.py
@ -8,7 +8,7 @@ import hashlib
 import uuid


-def generate_chunk_id(document_id: str, chunk_text: str) -> str:
+def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
    """
    Generate a unique chunk ID using a hash of the document ID and chunk text.

@ -16,4 +16,6 @@ def generate_chunk_id(document_id: str, chunk_text: str) -> str:
    Adding usedforsecurity=False for compatibility with FIPS environments.
    """
    hash_input = f"{document_id}:{chunk_text}".encode()
+    if chunk_window:
+        hash_input += f":{chunk_window}".encode()
    return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))