From 8ca56484dc246659c5578672e7cfc6f410592339 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Tue, 11 Mar 2025 16:54:55 +0100 Subject: [PATCH] Removed docling-related code. --- .../inline/preprocessing/docling/__init__.py | 18 --- .../inline/preprocessing/docling/config.py | 10 -- .../inline/preprocessing/docling/docling.py | 114 ------------------ .../providers/registry/preprocessing.py | 8 -- 4 files changed, 150 deletions(-) delete mode 100644 llama_stack/providers/inline/preprocessing/docling/__init__.py delete mode 100644 llama_stack/providers/inline/preprocessing/docling/config.py delete mode 100644 llama_stack/providers/inline/preprocessing/docling/docling.py diff --git a/llama_stack/providers/inline/preprocessing/docling/__init__.py b/llama_stack/providers/inline/preprocessing/docling/__init__.py deleted file mode 100644 index 15eeccb71..000000000 --- a/llama_stack/providers/inline/preprocessing/docling/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .config import InlineDoclingConfig - - -async def get_provider_impl( - config: InlineDoclingConfig, - _deps, -): - from .docling import InclineDoclingPreprocessorImpl - - impl = InclineDoclingPreprocessorImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/preprocessing/docling/config.py b/llama_stack/providers/inline/preprocessing/docling/config.py deleted file mode 100644 index 5428f7649..000000000 --- a/llama_stack/providers/inline/preprocessing/docling/config.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from pydantic import BaseModel - - -class InlineDoclingConfig(BaseModel): - chunk: bool diff --git a/llama_stack/providers/inline/preprocessing/docling/docling.py b/llama_stack/providers/inline/preprocessing/docling/docling.py deleted file mode 100644 index c292e8f89..000000000 --- a/llama_stack/providers/inline/preprocessing/docling/docling.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -import logging -from typing import List, Optional - -from docling.document_converter import DocumentConverter -from docling_core.transforms.chunker.hybrid_chunker import HybridChunker - -from llama_stack.apis.common.content_types import URL -from llama_stack.apis.preprocessing import ( - Preprocessing, - PreprocessingDataElement, - PreprocessingDataFormat, - PreprocessingDataType, - Preprocessor, - PreprocessorChain, - PreprocessorOptions, - PreprocessorResponse, -) -from llama_stack.apis.vector_io import Chunk -from llama_stack.providers.datatypes import PreprocessorsProtocolPrivate -from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConfig - -log = logging.getLogger(__name__) - - -class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate): - # this preprocessor receives URLs / paths to documents as input - input_types = [PreprocessingDataType.document_uri] - - # this preprocessor either only converts the documents into a text format, or also chunks them - output_types = [PreprocessingDataType.raw_text_document, PreprocessingDataType.chunks] - - def __init__(self, config: InlineDoclingConfig) -> None: - self.config = config - self.converter = None - self.chunker = None - - async def initialize(self) -> None: ... - - async def shutdown(self) -> None: ... - - async def register_preprocessor(self, preprocessor: Preprocessor) -> None: ... - - async def unregister_preprocessor(self, preprocessor_id: str) -> None: ... - - async def do_preprocess( - self, - preprocessor_id: str, - preprocessor_inputs: List[PreprocessingDataElement], - options: Optional[PreprocessorOptions] = None, - ) -> PreprocessorResponse: - if self.converter is None: - # this is the first time this method is called - self.converter = DocumentConverter() - if self.config.chunk and self.chunker is None: - # TODO: docling should use Llama Stack's inference API instead of handling tokenization by itself - self.chunker = HybridChunker() - - results = [] - - for inp in preprocessor_inputs: - if isinstance(inp.data_element_path_or_content, str): - url = inp.data_element_path_or_content - elif isinstance(inp.data_element_path_or_content, URL): - url = inp.data_element_path_or_content.uri - else: - log.error( - f"Unexpected type {type(inp.data_element_path_or_content)} for input {inp.data_element_path_or_content}, skipping this input." - ) - continue - - converted_document = self.converter.convert(url).document - - if self.config.chunk: - result = self.chunker.chunk(converted_document) - for i, chunk in enumerate(result): - metadata = chunk.meta.dict() - # TODO: some vector DB adapters rely on a hard-coded header 'document_id'. This should be fixed. - metadata["document_id"] = inp.data_element_id - # TODO: the RAG tool implementation relies in a hard-coded header 'token_count' - metadata["token_count"] = self.chunker._count_chunk_tokens(chunk) - raw_chunk = Chunk(content=chunk.text, metadata=metadata) - chunk_data_element = PreprocessingDataElement( - data_element_id=f"{inp.data_element_id}_chunk_{i}", - data_element_type=PreprocessingDataType.chunks, - data_element_format=PreprocessingDataFormat.txt, - data_element_path_or_content=raw_chunk, - ) - results.append(chunk_data_element) - - else: - result = PreprocessingDataElement( - data_element_id=inp.data_element_id, - data_element_type=PreprocessingDataType.raw_text_document, - data_element_format=PreprocessingDataFormat.txt, - data_element_path_or_content=converted_document.export_to_markdown(), - ) - results.append(result) - - output_data_type = ( - PreprocessingDataType.chunks if self.config.chunk else PreprocessingDataType.raw_text_document - ) - return PreprocessorResponse(success=True, output_data_type=output_data_type, results=results) - - async def preprocess( - self, - preprocessors: PreprocessorChain, - preprocessor_inputs: List[PreprocessingDataElement], - ) -> PreprocessorResponse: - return await self.do_preprocess(preprocessor_id="", preprocessor_inputs=preprocessor_inputs) diff --git a/llama_stack/providers/registry/preprocessing.py b/llama_stack/providers/registry/preprocessing.py index 00c210150..ef306e732 100644 --- a/llama_stack/providers/registry/preprocessing.py +++ b/llama_stack/providers/registry/preprocessing.py @@ -15,14 +15,6 @@ from llama_stack.providers.datatypes import ( def available_providers() -> List[ProviderSpec]: return [ - InlineProviderSpec( - api=Api.preprocessing, - provider_type="inline::docling", - pip_packages=["docling"], - module="llama_stack.providers.inline.preprocessing.docling", - config_class="llama_stack.providers.inline.preprocessing.docling.InlineDoclingConfig", - api_dependencies=[], - ), InlineProviderSpec( api=Api.preprocessing, provider_type="inline::basic",