mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-07 19:12:09 +00:00
Initial and very basic docling support.
This commit is contained in:
parent
aa1b670d5c
commit
c3515530bb
3 changed files with 38 additions and 5 deletions
|
@ -27,8 +27,11 @@ class PreprocessingInput(BaseModel):
|
|||
|
||||
# Free-form, preprocessor-specific options bag.
# Uses the PEP 585 builtin generic (`dict`) for consistency with the other
# annotations in this module (e.g. `metadata: dict[str, Any]`), instead of
# the legacy `typing.Dict`.
PreprocessorOptions = dict[str, Any]
|
||||
|
||||
# TODO: shouldn't be just a string
|
||||
PreprocessingResult = str
|
||||
|
||||
@json_schema_type
class PreprocessingResult(BaseModel):
    """A single unit of preprocessor output.

    Replaces the earlier plain-string result type with a structured model
    so that providers can attach metadata alongside the extracted text.
    """

    # Provider-specific metadata for this result (e.g. chunk metadata when
    # chunking is enabled — NOTE(review): the docling provider passes
    # `chunk.meta` here; confirm it coerces cleanly to a plain dict).
    metadata: dict[str, Any]
    # The extracted text content (presumably plain text or markdown,
    # depending on the provider — TODO confirm).
    data: str
|
||||
|
||||
|
||||
@json_schema_type
|
||||
|
|
|
@ -6,4 +6,6 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class InlineDoclingConfig(BaseModel): ...
|
||||
class InlineDoclingConfig(BaseModel):
    """Configuration for the inline docling preprocessing provider."""

    # When True, the converted document is split into chunks with docling's
    # HybridChunker; when False, the whole document is exported as markdown.
    chunk: bool
    # Tokenizer identifier handed to HybridChunker(tokenizer=...). Only
    # consulted when `chunk` is True, but currently required either way —
    # NOTE(review): consider defaulting this once the API stabilizes.
    tokenizer: str
|
||||
|
|
|
@ -5,10 +5,15 @@
|
|||
# the root directory of this source tree.
|
||||
from typing import List
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
||||
|
||||
from llama_stack.apis.common.content_types import URL
|
||||
from llama_stack.apis.preprocessing import (
|
||||
Preprocessing,
|
||||
PreprocessingInput,
|
||||
PreprocessingResponse,
|
||||
PreprocessingResult,
|
||||
Preprocessor,
|
||||
PreprocessorOptions,
|
||||
)
|
||||
|
@ -19,8 +24,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf
|
|||
class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
|
||||
def __init__(self, config: InlineDoclingConfig) -> None:
|
||||
self.config = config
|
||||
self.converter = DocumentConverter()
|
||||
self.chunker = None
|
||||
|
||||
async def initialize(self) -> None: ...
|
||||
async def initialize(self) -> None:
|
||||
if self.config.chunk:
|
||||
self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
|
||||
|
||||
async def shutdown(self) -> None:
        """No teardown required by this provider."""
|
||||
|
||||
|
@ -33,4 +42,23 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
|
|||
preprocessor_id: str,
|
||||
preprocessor_inputs: List[PreprocessingInput],
|
||||
options: PreprocessorOptions,
|
||||
) -> PreprocessingResponse: ...
|
||||
) -> PreprocessingResponse:
|
||||
results = []
|
||||
|
||||
for inp in preprocessor_inputs:
|
||||
if isinstance(inp.path_or_content, str):
|
||||
url = inp.path_or_content
|
||||
elif isinstance(inp.path_or_content, URL):
|
||||
url = inp.path_or_content.uri
|
||||
else:
|
||||
raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
|
||||
|
||||
converted_document = self.converter.convert(url).document
|
||||
if self.config.chunk:
|
||||
result = self.chunker.chunk(converted_document)
|
||||
results.extend([PreprocessingResult(data=chunk.text, metadata=chunk.meta) for chunk in result])
|
||||
else:
|
||||
result = converted_document.export_to_markdown()
|
||||
results.append(result)
|
||||
|
||||
return PreprocessingResponse(status=True, results=results)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue