From 8ca56484dc246659c5578672e7cfc6f410592339 Mon Sep 17 00:00:00 2001
From: ilya-kolchinsky <ilya.kolchinsky@gmail.com>
Date: Tue, 11 Mar 2025 16:54:55 +0100
Subject: [PATCH] Removed docling-related code.

---
 .../inline/preprocessing/docling/__init__.py  |  18 ---
 .../inline/preprocessing/docling/config.py    |  10 --
 .../inline/preprocessing/docling/docling.py   | 114 ------------------
 .../providers/registry/preprocessing.py       |   8 --
 4 files changed, 150 deletions(-)
 delete mode 100644 llama_stack/providers/inline/preprocessing/docling/__init__.py
 delete mode 100644 llama_stack/providers/inline/preprocessing/docling/config.py
 delete mode 100644 llama_stack/providers/inline/preprocessing/docling/docling.py

diff --git a/llama_stack/providers/inline/preprocessing/docling/__init__.py b/llama_stack/providers/inline/preprocessing/docling/__init__.py
deleted file mode 100644
index 15eeccb71..000000000
--- a/llama_stack/providers/inline/preprocessing/docling/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .config import InlineDoclingConfig
-
-
-async def get_provider_impl(
-    config: InlineDoclingConfig,
-    _deps,
-):
-    from .docling import InclineDoclingPreprocessorImpl
-
-    impl = InclineDoclingPreprocessorImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/llama_stack/providers/inline/preprocessing/docling/config.py b/llama_stack/providers/inline/preprocessing/docling/config.py
deleted file mode 100644
index 5428f7649..000000000
--- a/llama_stack/providers/inline/preprocessing/docling/config.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from pydantic import BaseModel
-
-
-class InlineDoclingConfig(BaseModel):
-    chunk: bool
diff --git a/llama_stack/providers/inline/preprocessing/docling/docling.py b/llama_stack/providers/inline/preprocessing/docling/docling.py
deleted file mode 100644
index c292e8f89..000000000
--- a/llama_stack/providers/inline/preprocessing/docling/docling.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import logging
-from typing import List, Optional
-
-from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.preprocessing import (
-    Preprocessing,
-    PreprocessingDataElement,
-    PreprocessingDataFormat,
-    PreprocessingDataType,
-    Preprocessor,
-    PreprocessorChain,
-    PreprocessorOptions,
-    PreprocessorResponse,
-)
-from llama_stack.apis.vector_io import Chunk
-from llama_stack.providers.datatypes import PreprocessorsProtocolPrivate
-from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConfig
-
-log = logging.getLogger(__name__)
-
-
-class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
-    # this preprocessor receives URLs / paths to documents as input
-    input_types = [PreprocessingDataType.document_uri]
-
-    # this preprocessor either only converts the documents into a text format, or also chunks them
-    output_types = [PreprocessingDataType.raw_text_document, PreprocessingDataType.chunks]
-
-    def __init__(self, config: InlineDoclingConfig) -> None:
-        self.config = config
-        self.converter = None
-        self.chunker = None
-
-    async def initialize(self) -> None: ...
-
-    async def shutdown(self) -> None: ...
-
-    async def register_preprocessor(self, preprocessor: Preprocessor) -> None: ...
-
-    async def unregister_preprocessor(self, preprocessor_id: str) -> None: ...
-
-    async def do_preprocess(
-        self,
-        preprocessor_id: str,
-        preprocessor_inputs: List[PreprocessingDataElement],
-        options: Optional[PreprocessorOptions] = None,
-    ) -> PreprocessorResponse:
-        if self.converter is None:
-            # this is the first time this method is called
-            self.converter = DocumentConverter()
-            if self.config.chunk and self.chunker is None:
-                # TODO: docling should use Llama Stack's inference API instead of handling tokenization by itself
-                self.chunker = HybridChunker()
-
-        results = []
-
-        for inp in preprocessor_inputs:
-            if isinstance(inp.data_element_path_or_content, str):
-                url = inp.data_element_path_or_content
-            elif isinstance(inp.data_element_path_or_content, URL):
-                url = inp.data_element_path_or_content.uri
-            else:
-                log.error(
-                    f"Unexpected type {type(inp.data_element_path_or_content)} for input {inp.data_element_path_or_content}, skipping this input."
-                )
-                continue
-
-            converted_document = self.converter.convert(url).document
-
-            if self.config.chunk:
-                result = self.chunker.chunk(converted_document)
-                for i, chunk in enumerate(result):
-                    metadata = chunk.meta.dict()
-                    # TODO: some vector DB adapters rely on a hard-coded header 'document_id'. This should be fixed.
-                    metadata["document_id"] = inp.data_element_id
-                    # TODO: the RAG tool implementation relies in a hard-coded header 'token_count'
-                    metadata["token_count"] = self.chunker._count_chunk_tokens(chunk)
-                    raw_chunk = Chunk(content=chunk.text, metadata=metadata)
-                    chunk_data_element = PreprocessingDataElement(
-                        data_element_id=f"{inp.data_element_id}_chunk_{i}",
-                        data_element_type=PreprocessingDataType.chunks,
-                        data_element_format=PreprocessingDataFormat.txt,
-                        data_element_path_or_content=raw_chunk,
-                    )
-                    results.append(chunk_data_element)
-
-            else:
-                result = PreprocessingDataElement(
-                    data_element_id=inp.data_element_id,
-                    data_element_type=PreprocessingDataType.raw_text_document,
-                    data_element_format=PreprocessingDataFormat.txt,
-                    data_element_path_or_content=converted_document.export_to_markdown(),
-                )
-                results.append(result)
-
-        output_data_type = (
-            PreprocessingDataType.chunks if self.config.chunk else PreprocessingDataType.raw_text_document
-        )
-        return PreprocessorResponse(success=True, output_data_type=output_data_type, results=results)
-
-    async def preprocess(
-        self,
-        preprocessors: PreprocessorChain,
-        preprocessor_inputs: List[PreprocessingDataElement],
-    ) -> PreprocessorResponse:
-        return await self.do_preprocess(preprocessor_id="", preprocessor_inputs=preprocessor_inputs)
diff --git a/llama_stack/providers/registry/preprocessing.py b/llama_stack/providers/registry/preprocessing.py
index 00c210150..ef306e732 100644
--- a/llama_stack/providers/registry/preprocessing.py
+++ b/llama_stack/providers/registry/preprocessing.py
@@ -15,14 +15,6 @@ from llama_stack.providers.datatypes import (
 
 def available_providers() -> List[ProviderSpec]:
     return [
-        InlineProviderSpec(
-            api=Api.preprocessing,
-            provider_type="inline::docling",
-            pip_packages=["docling"],
-            module="llama_stack.providers.inline.preprocessing.docling",
-            config_class="llama_stack.providers.inline.preprocessing.docling.InlineDoclingConfig",
-            api_dependencies=[],
-        ),
         InlineProviderSpec(
             api=Api.preprocessing,
             provider_type="inline::basic",