From 5014de434eb506526ef4a3b4b803302225f36bcf Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Tue, 4 Mar 2025 12:34:30 +0100 Subject: [PATCH] Added input/output type declaration. --- llama_stack/apis/preprocessing/preprocessing.py | 12 ++++++++---- .../inline/preprocessing/docling/docling.py | 7 +++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/llama_stack/apis/preprocessing/preprocessing.py b/llama_stack/apis/preprocessing/preprocessing.py index 780d5ec40..044ecd952 100644 --- a/llama_stack/apis/preprocessing/preprocessing.py +++ b/llama_stack/apis/preprocessing/preprocessing.py @@ -14,15 +14,19 @@ from llama_stack.apis.vector_io import Chunk from llama_stack.schema_utils import json_schema_type, webmethod -class PreprocessingInputType(Enum): - document_content = "document_content" - document_path = "document_path" +class PreprocessingDataType(Enum): + document_uri = "document_uri" + document_directory_uri = "document_directory_uri" + + binary_document = "binary_document" + raw_text_document = "raw_text_document" + chunks = "chunks" @json_schema_type class PreprocessingInput(BaseModel): preprocessor_input_id: str - preprocessor_input_type: Optional[PreprocessingInputType] + preprocessor_input_type: Optional[PreprocessingDataType] path_or_content: str | URL diff --git a/llama_stack/providers/inline/preprocessing/docling/docling.py b/llama_stack/providers/inline/preprocessing/docling/docling.py index 9305f7d8e..65f5bdba0 100644 --- a/llama_stack/providers/inline/preprocessing/docling/docling.py +++ b/llama_stack/providers/inline/preprocessing/docling/docling.py @@ -11,6 +11,7 @@ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from llama_stack.apis.common.content_types import URL from llama_stack.apis.preprocessing import ( Preprocessing, + PreprocessingDataType, PreprocessingInput, PreprocessingResponse, Preprocessor, @@ -22,6 +23,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate): + # this preprocessor receives URLs / paths to documents as input + INPUT_TYPES = [PreprocessingDataType.document_uri] + + # this preprocessor either only converts the documents into a text format, or also chunks them + OUTPUT_TYPES = [PreprocessingDataType.raw_text_document, PreprocessingDataType.chunks] + def __init__(self, config: InlineDoclingConfig) -> None: self.config = config self.converter = DocumentConverter()