Initial and very basic docling support.

2026-01-03 10:52:17 +00:00 · 2025-03-03 15:17:40 +01:00 · 2025-03-03 15:17:40 +01:00 · c3515530bb
commit c3515530bb
parent aa1b670d5c
3 changed files with 38 additions and 5 deletions
--- a/llama_stack/apis/preprocessing/preprocessing.py
+++ b/llama_stack/apis/preprocessing/preprocessing.py
@@ -27,8 +27,11 @@ class PreprocessingInput(BaseModel):
 PreprocessorOptions = Dict[str, Any]
-# TODO: shouldn't be just a string
+
-PreprocessingResult = str
+@json_schema_type
 class PreprocessingResult(BaseModel):
    metadata: dict[str, Any]
    data: str
@json_schema_type
--- a/llama_stack/providers/inline/preprocessing/docling/config.py
+++ b/llama_stack/providers/inline/preprocessing/docling/config.py
@@ -6,4 +6,6 @@
 from pydantic import BaseModel
-class InlineDoclingConfig(BaseModel): ...
+class InlineDoclingConfig(BaseModel):
    chunk: bool
    tokenizer: str
--- a/llama_stack/providers/inline/preprocessing/docling/docling.py
+++ b/llama_stack/providers/inline/preprocessing/docling/docling.py
@@ -5,10 +5,15 @@
 # the root directory of this source tree.
 from typing import List
 from docling.document_converter import DocumentConverter
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.preprocessing import (
    Preprocessing,
    PreprocessingInput,
    PreprocessingResponse,
    PreprocessingResult,
    Preprocessor,
    PreprocessorOptions,
 )
@@ -19,8 +24,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf
 class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
    def __init__(self, config: InlineDoclingConfig) -> None:
        self.config = config
        self.converter = DocumentConverter()
        self.chunker = None
-    async def initialize(self) -> None: ...
+    async def initialize(self) -> None:
        if self.config.chunk:
            self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
    async def shutdown(self) -> None: ...
@@ -33,4 +42,23 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
        preprocessor_id: str,
        preprocessor_inputs: List[PreprocessingInput],
        options: PreprocessorOptions,
-    ) -> PreprocessingResponse: ...
+    ) -> PreprocessingResponse:
        results = []
        for inp in preprocessor_inputs:
            if isinstance(inp.path_or_content, str):
                url = inp.path_or_content
            elif isinstance(inp.path_or_content, URL):
                url = inp.path_or_content.uri
            else:
                raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
            converted_document = self.converter.convert(url).document
            if self.config.chunk:
                result = self.chunker.chunk(converted_document)
                results.extend([PreprocessingResult(data=chunk.text, metadata=chunk.meta) for chunk in result])
            else:
                result = converted_document.export_to_markdown()
                results.append(result)
        return PreprocessingResponse(status=True, results=results)