mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-10 04:08:31 +00:00
Fixed a few issues in the docling provider.
This commit is contained in:
parent
f10a412898
commit
275fdbc23f
2 changed files with 10 additions and 3 deletions
|
@ -8,4 +8,3 @@ from pydantic import BaseModel
|
||||||
|
|
||||||
class InlineDoclingConfig(BaseModel):
|
class InlineDoclingConfig(BaseModel):
|
||||||
chunk: bool
|
chunk: bool
|
||||||
tokenizer: str
|
|
||||||
|
|
|
@ -41,7 +41,8 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
|
||||||
|
|
||||||
async def initialize(self) -> None:
|
async def initialize(self) -> None:
|
||||||
if self.config.chunk:
|
if self.config.chunk:
|
||||||
self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
|
# TODO: docling should use Llama Stack's inference API instead of handling tokenization by itself
|
||||||
|
self.chunker = HybridChunker()
|
||||||
|
|
||||||
async def shutdown(self) -> None: ...
|
async def shutdown(self) -> None: ...
|
||||||
|
|
||||||
|
@ -69,10 +70,16 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
|
||||||
continue
|
continue
|
||||||
|
|
||||||
converted_document = self.converter.convert(url).document
|
converted_document = self.converter.convert(url).document
|
||||||
|
|
||||||
if self.config.chunk:
|
if self.config.chunk:
|
||||||
result = self.chunker.chunk(converted_document)
|
result = self.chunker.chunk(converted_document)
|
||||||
for i, chunk in enumerate(result):
|
for i, chunk in enumerate(result):
|
||||||
raw_chunk = Chunk(content=chunk.text, metadata=chunk.meta)
|
metadata = chunk.meta.dict()
|
||||||
|
# TODO: some vector DB adapters rely on a hard-coded header 'document_id'. This should be fixed.
|
||||||
|
metadata["document_id"] = inp.data_element_id
|
||||||
|
# TODO: the RAG tool implementation relies on a hard-coded header 'token_count'
|
||||||
|
metadata["token_count"] = self.chunker._count_chunk_tokens(chunk)
|
||||||
|
raw_chunk = Chunk(content=chunk.text, metadata=metadata)
|
||||||
chunk_data_element = PreprocessingDataElement(
|
chunk_data_element = PreprocessingDataElement(
|
||||||
data_element_id=f"{inp.data_element_id}_chunk_{i}",
|
data_element_id=f"{inp.data_element_id}_chunk_{i}",
|
||||||
data_element_type=PreprocessingDataType.chunks,
|
data_element_type=PreprocessingDataType.chunks,
|
||||||
|
@ -80,6 +87,7 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
|
||||||
data_element_path_or_content=raw_chunk,
|
data_element_path_or_content=raw_chunk,
|
||||||
)
|
)
|
||||||
results.append(chunk_data_element)
|
results.append(chunk_data_element)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
result = PreprocessingDataElement(
|
result = PreprocessingDataElement(
|
||||||
data_element_id=inp.data_element_id,
|
data_element_id=inp.data_element_id,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue