mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-07 19:12:09 +00:00
Initial and very basic docling support.
This commit is contained in:
parent
aa1b670d5c
commit
c3515530bb
3 changed files with 38 additions and 5 deletions
|
@ -27,8 +27,11 @@ class PreprocessingInput(BaseModel):
|
|||
|
||||
# Free-form, preprocessor-specific options bag.
# Uses the PEP 585 builtin generic (`dict`) for consistency with the other
# annotations in this module (e.g. `metadata: dict[str, Any]`), instead of
# the legacy `typing.Dict`.
PreprocessorOptions = dict[str, Any]
|
||||
|
||||
# TODO: shouldn't be just a string
|
||||
PreprocessingResult = str
|
||||
|
||||
@json_schema_type
class PreprocessingResult(BaseModel):
    """A single unit of preprocessor output.

    Replaces the earlier plain-string result type with a structured model
    so that providers can attach metadata alongside the extracted text.
    """

    # Provider-specific metadata for this result (e.g. chunk metadata when
    # chunking is enabled — NOTE(review): the docling provider passes
    # `chunk.meta` here; confirm it coerces cleanly to a plain dict).
    metadata: dict[str, Any]
    # The extracted text content (presumably plain text or markdown,
    # depending on the provider — TODO confirm).
    data: str
|
||||
|
||||
|
||||
@json_schema_type
|
||||
|
|
|
@ -6,4 +6,6 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class InlineDoclingConfig(BaseModel): ...
|
||||
class InlineDoclingConfig(BaseModel):
    """Configuration for the inline docling preprocessing provider."""

    # When True, the converted document is split into chunks with docling's
    # HybridChunker; when False, the whole document is exported as markdown.
    chunk: bool
    # Tokenizer identifier handed to HybridChunker(tokenizer=...). Only
    # consulted when `chunk` is True, but currently required either way —
    # NOTE(review): consider defaulting this once the API stabilizes.
    tokenizer: str
|
||||
|
|
|
@ -5,10 +5,15 @@
|
|||
# the root directory of this source tree.
|
||||
from typing import List
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
||||
|
||||
from llama_stack.apis.common.content_types import URL
|
||||
from llama_stack.apis.preprocessing import (
|
||||
Preprocessing,
|
||||
PreprocessingInput,
|
||||
PreprocessingResponse,
|
||||
PreprocessingResult,
|
||||
Preprocessor,
|
||||
PreprocessorOptions,
|
||||
)
|
||||
|
@ -19,8 +24,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf
|
|||
class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
|
||||
def __init__(self, config: InlineDoclingConfig) -> None:
|
||||
self.config = config
|
||||
self.converter = DocumentConverter()
|
||||
self.chunker = None
|
||||
|
||||
async def initialize(self) -> None: ...
|
||||
async def initialize(self) -> None:
|
||||
if self.config.chunk:
|
||||
self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
|
||||
|
||||
async def shutdown(self) -> None:
        """No teardown required by this provider."""
|
||||
|
||||
|
@ -33,4 +42,23 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
|
|||
preprocessor_id: str,
|
||||
preprocessor_inputs: List[PreprocessingInput],
|
||||
options: PreprocessorOptions,
|
||||
) -> PreprocessingResponse: ...
|
||||
) -> PreprocessingResponse:
|
||||
results = []
|
||||
|
||||
for inp in preprocessor_inputs:
|
||||
if isinstance(inp.path_or_content, str):
|
||||
url = inp.path_or_content
|
||||
elif isinstance(inp.path_or_content, URL):
|
||||
url = inp.path_or_content.uri
|
||||
else:
|
||||
raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
|
||||
|
||||
converted_document = self.converter.convert(url).document
|
||||
if self.config.chunk:
|
||||
result = self.chunker.chunk(converted_document)
|
||||
results.extend([PreprocessingResult(data=chunk.text, metadata=chunk.meta) for chunk in result])
|
||||
else:
|
||||
result = converted_document.export_to_markdown()
|
||||
results.append(result)
|
||||
|
||||
return PreprocessingResponse(status=True, results=results)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue