From 275fdbc23fedc788466a06f63ac0e861f777a908 Mon Sep 17 00:00:00 2001
From: ilya-kolchinsky
Date: Thu, 6 Mar 2025 20:51:37 +0100
Subject: [PATCH] Fixed a few issues in the docling provider.

---
 .../providers/inline/preprocessing/docling/config.py |  1 -
 .../inline/preprocessing/docling/docling.py          | 12 ++++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/llama_stack/providers/inline/preprocessing/docling/config.py b/llama_stack/providers/inline/preprocessing/docling/config.py
index 78a212788..5428f7649 100644
--- a/llama_stack/providers/inline/preprocessing/docling/config.py
+++ b/llama_stack/providers/inline/preprocessing/docling/config.py
@@ -8,4 +8,3 @@ from pydantic import BaseModel
 
 class InlineDoclingConfig(BaseModel):
     chunk: bool
-    tokenizer: str
diff --git a/llama_stack/providers/inline/preprocessing/docling/docling.py b/llama_stack/providers/inline/preprocessing/docling/docling.py
index 5c2641ea7..9db89806b 100644
--- a/llama_stack/providers/inline/preprocessing/docling/docling.py
+++ b/llama_stack/providers/inline/preprocessing/docling/docling.py
@@ -41,7 +41,8 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
 
     async def initialize(self) -> None:
         if self.config.chunk:
-            self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
+            # TODO: docling should use Llama Stack's inference API instead of handling tokenization by itself
+            self.chunker = HybridChunker()
 
     async def shutdown(self) -> None: ...
 
@@ -69,10 +70,16 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
                 continue
 
             converted_document = self.converter.convert(url).document
+
             if self.config.chunk:
                 result = self.chunker.chunk(converted_document)
                 for i, chunk in enumerate(result):
-                    raw_chunk = Chunk(content=chunk.text, metadata=chunk.meta)
+                    metadata = chunk.meta.dict()
+                    # TODO: some vector DB adapters rely on a hard-coded header 'document_id'. This should be fixed.
+                    metadata["document_id"] = inp.data_element_id
+                    # TODO: the RAG tool implementation relies on a hard-coded header 'token_count'. This should be fixed as well.
+                    metadata["token_count"] = self.chunker._count_chunk_tokens(chunk)
+                    raw_chunk = Chunk(content=chunk.text, metadata=metadata)
                     chunk_data_element = PreprocessingDataElement(
                         data_element_id=f"{inp.data_element_id}_chunk_{i}",
                         data_element_type=PreprocessingDataType.chunks,
@@ -80,6 +87,7 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
                         data_element_path_or_content=raw_chunk,
                     )
                     results.append(chunk_data_element)
+
             else:
                 result = PreprocessingDataElement(
                     data_element_id=inp.data_element_id,
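For reference, a minimal standalone sketch of the chunking flow this patch produces. It is an
illustration under assumptions, not part of the patch: it assumes docling is installed,
"example.pdf" and the "doc-0" document ID are placeholders, and _count_chunk_tokens is the same
private docling helper the patch itself calls.

    from docling.document_converter import DocumentConverter
    from docling.chunking import HybridChunker

    converter = DocumentConverter()
    # As in the patch, HybridChunker() now falls back to its built-in default
    # tokenizer instead of the removed `tokenizer` config field.
    chunker = HybridChunker()

    # "example.pdf" is a placeholder input document.
    document = converter.convert("example.pdf").document

    for i, chunk in enumerate(chunker.chunk(document)):
        metadata = chunk.meta.dict()
        # The patch injects these two keys because some vector DB adapters and
        # the RAG tool expect them; "doc-0" stands in for inp.data_element_id.
        metadata["document_id"] = "doc-0"
        # Private docling helper, used the same way the patch uses it.
        metadata["token_count"] = chunker._count_chunk_tokens(chunk)
        print(f"doc-0_chunk_{i}: {metadata['token_count']} tokens")

Dropping the tokenizer field means the chunker's default tokenizer determines token counts until
the provider can delegate tokenization to Llama Stack's inference API, as the first TODO notes.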