# llama_stack/apis/preprocessing/preprocessing.py


@json_schema_type
class PreprocessingResult(BaseModel):
    """A single preprocessing output: extracted text plus provenance metadata.

    Replaces the previous ``PreprocessingResult = str`` alias so that providers
    can attach per-result details (e.g. chunk headings, document origin)
    alongside the text itself.
    """

    # Provider-specific details describing how `data` was produced.
    # NOTE: `Dict[str, Any]` (typing) rather than `dict[str, Any]` for
    # consistency with `PreprocessorOptions = Dict[str, Any]` in this module.
    metadata: Dict[str, Any]
    # The preprocessed (converted and optionally chunked) text content.
    data: str


# llama_stack/providers/inline/preprocessing/docling/config.py


class InlineDoclingConfig(BaseModel):
    """Configuration for the inline Docling preprocessing provider."""

    # When True, the converted document is split via docling's HybridChunker;
    # when False, the whole document is exported as a single markdown result.
    chunk: bool
    # Tokenizer identifier handed to HybridChunker; only read when `chunk` is True.
    tokenizer: str
from typing import List

from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.preprocessing import (
    Preprocessing,
    PreprocessingInput,
    PreprocessingResponse,
    PreprocessingResult,
    Preprocessor,
    PreprocessorOptions,
)
from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConfig


# NOTE(review): class name keeps the existing "Incline" spelling (likely a typo
# for "Inline") because it is the registered provider entry point; renaming it
# would break callers and must be done in a coordinated change.
class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
    """Inline preprocessing provider backed by Docling document conversion.

    Converts each input document with :class:`DocumentConverter`; when
    ``config.chunk`` is set, the converted document is additionally split into
    chunks with docling's :class:`HybridChunker`.
    """

    def __init__(self, config: InlineDoclingConfig) -> None:
        self.config = config
        # The converter holds no per-request state, so it is built eagerly.
        self.converter = DocumentConverter()
        # Built lazily in initialize(), and only when chunking is enabled.
        self.chunker = None

    async def initialize(self) -> None:
        """Create the chunker if this provider is configured to chunk."""
        if self.config.chunk:
            self.chunker = HybridChunker(tokenizer=self.config.tokenizer)

    async def shutdown(self) -> None: ...

    async def preprocess(
        self,
        preprocessor_id: str,
        preprocessor_inputs: List[PreprocessingInput],
        options: PreprocessorOptions,
    ) -> PreprocessingResponse:
        """Convert (and optionally chunk) each input document.

        :param preprocessor_id: identifier of the registered preprocessor.
        :param preprocessor_inputs: documents to process; each input's
            ``path_or_content`` must be a ``str`` or a ``URL``.
        :param options: provider options (currently unused here).
        :returns: a ``PreprocessingResponse`` whose ``results`` is a list of
            ``PreprocessingResult`` objects.
        :raises ValueError: if an input's ``path_or_content`` has an
            unsupported type.
        """
        results = []

        for inp in preprocessor_inputs:
            # Accept either a raw URL/path string or a URL object.
            if isinstance(inp.path_or_content, str):
                url = inp.path_or_content
            elif isinstance(inp.path_or_content, URL):
                url = inp.path_or_content.uri
            else:
                raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")

            converted_document = self.converter.convert(url).document
            if self.config.chunk:
                chunks = self.chunker.chunk(converted_document)
                # chunk.meta is a docling pydantic model, not a dict; export it
                # so it validates against PreprocessingResult.metadata
                # (export_json_dict per docling's chunking docs — TODO confirm
                # against the pinned docling-core version).
                results.extend(
                    PreprocessingResult(data=chunk.text, metadata=chunk.meta.export_json_dict())
                    for chunk in chunks
                )
            else:
                # BUG FIX: previously the raw markdown string was appended,
                # mixing `str` and `PreprocessingResult` in the response; wrap
                # it so every element of `results` has one type.
                markdown = converted_document.export_to_markdown()
                results.append(PreprocessingResult(data=markdown, metadata={}))

        return PreprocessingResponse(status=True, results=results)