Added draft implementation of built-in preprocessing for RAG.

2026-01-02 16:50:01 +00:00 · 2025-03-04 15:22:29 +01:00 · 2025-03-04 15:22:29 +01:00 · 1a6e71c61f
commit 1a6e71c61f
parent 5014de434e
9 changed files with 299 additions and 4 deletions
--- a/llama_stack/providers/inline/preprocessing/docling/docling.py
+++ b/llama_stack/providers/inline/preprocessing/docling/docling.py
@ -3,6 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import logging
 from typing import List

 from docling.document_converter import DocumentConverter
@ -21,6 +22,8 @@ from llama_stack.apis.vector_io import Chunk
 from llama_stack.providers.datatypes import PreprocessorsProtocolPrivate
 from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConfig

+log = logging.getLogger(__name__)
+

 class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
    # this preprocessor receives URLs / paths to documents as input
@ -58,7 +61,10 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
            elif isinstance(inp.path_or_content, URL):
                url = inp.path_or_content.uri
            else:
-                raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
+                log.error(
+                    f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}, skipping this input."
+                )
+                continue

            converted_document = self.converter.convert(url).document
            if self.config.chunk: