Initial and very basic docling support.

This commit is contained in:
ilya-kolchinsky 2025-03-03 15:17:40 +01:00
parent aa1b670d5c
commit c3515530bb
3 changed files with 38 additions and 5 deletions

View file

@ -27,8 +27,11 @@ class PreprocessingInput(BaseModel):
PreprocessorOptions = Dict[str, Any] PreprocessorOptions = Dict[str, Any]
# TODO: shouldn't be just a string
@json_schema_type
class PreprocessingResult(BaseModel):
    """A single preprocessor output: a text payload together with its metadata."""

    # Free-form key/value details attached to this result.
    metadata: dict[str, Any]
    # Text payload of this result.
    data: str
@json_schema_type @json_schema_type

View file

@ -6,4 +6,6 @@
from pydantic import BaseModel from pydantic import BaseModel
class InlineDoclingConfig(BaseModel):
    """Settings for the inline docling preprocessing provider."""

    # Whether converted documents are split into chunks.
    chunk: bool
    # Tokenizer identifier handed to the chunker when chunking is enabled.
    tokenizer: str

View file

@ -5,10 +5,15 @@
# the root directory of this source tree. # the root directory of this source tree.
from typing import List from typing import List
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.preprocessing import ( from llama_stack.apis.preprocessing import (
Preprocessing, Preprocessing,
PreprocessingInput, PreprocessingInput,
PreprocessingResponse, PreprocessingResponse,
PreprocessingResult,
Preprocessor, Preprocessor,
PreprocessorOptions, PreprocessorOptions,
) )
@ -19,8 +24,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf
class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate): class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
def __init__(self, config: InlineDoclingConfig) -> None:
    """Store the provider config and create the docling document converter.

    The chunker starts out as ``None``; ``initialize`` creates it when the
    config enables chunking.
    """
    self.config = config
    self.converter = DocumentConverter()
    self.chunker = None
async def initialize(self) -> None:
    """Create the hybrid chunker when the config asks for chunked output."""
    if not self.config.chunk:
        return
    self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
async def shutdown(self) -> None:
    """Do nothing; this provider performs no work on shutdown."""
@ -33,4 +42,23 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
preprocessor_id: str, preprocessor_id: str,
preprocessor_inputs: List[PreprocessingInput], preprocessor_inputs: List[PreprocessingInput],
options: PreprocessorOptions, options: PreprocessorOptions,
) -> PreprocessingResponse: ... ) -> PreprocessingResponse:
results = []
for inp in preprocessor_inputs:
if isinstance(inp.path_or_content, str):
url = inp.path_or_content
elif isinstance(inp.path_or_content, URL):
url = inp.path_or_content.uri
else:
raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
converted_document = self.converter.convert(url).document
if self.config.chunk:
result = self.chunker.chunk(converted_document)
results.extend([PreprocessingResult(data=chunk.text, metadata=chunk.meta) for chunk in result])
else:
result = converted_document.export_to_markdown()
results.append(result)
return PreprocessingResponse(status=True, results=results)