Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-07 19:12:09 +00:00)
Initial and very basic docling support.

commit c3515530bb (parent aa1b670d5c)
3 changed files with 38 additions and 5 deletions
@@ -27,8 +27,11 @@ class PreprocessingInput(BaseModel):
 PreprocessorOptions = Dict[str, Any]
 
 
-# TODO: shouldn't be just a string
-PreprocessingResult = str
+@json_schema_type
+class PreprocessingResult(BaseModel):
+    metadata: dict[str, Any]
+    data: str
+
 
 
 @json_schema_type
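For reference (not part of the commit): a minimal sketch of constructing the new PreprocessingResult model, with the class restated locally so the snippet is self-contained. The field names come from the hunk above; the values are invented.

# Illustrative sketch only; mirrors the model added above, values are made up.
from typing import Any

from pydantic import BaseModel


class PreprocessingResult(BaseModel):
    metadata: dict[str, Any]
    data: str


result = PreprocessingResult(
    metadata={"source": "report.pdf", "chunk_index": 0},
    data="First chunk of extracted text.",
)
print(result.model_dump())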
@@ -6,4 +6,6 @@
 from pydantic import BaseModel
 
 
-class InlineDoclingConfig(BaseModel): ...
+class InlineDoclingConfig(BaseModel):
+    chunk: bool
+    tokenizer: str
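Again outside the diff: the two new fields map directly onto the chunking behaviour in the provider below. A sketch of filling the config in, where the tokenizer value is an assumed Hugging Face model id rather than anything the commit specifies.

# Illustrative sketch only; the class matches the diff, the values are assumptions.
from pydantic import BaseModel


class InlineDoclingConfig(BaseModel):
    chunk: bool
    tokenizer: str


# chunk=True turns on HybridChunker in initialize(); the tokenizer id is a guess.
config = InlineDoclingConfig(chunk=True, tokenizer="sentence-transformers/all-MiniLM-L6-v2")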
@@ -5,10 +5,15 @@
 # the root directory of this source tree.
 from typing import List
 
+from docling.document_converter import DocumentConverter
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+
+from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.preprocessing import (
     Preprocessing,
     PreprocessingInput,
     PreprocessingResponse,
+    PreprocessingResult,
     Preprocessor,
     PreprocessorOptions,
 )
@@ -19,8 +24,12 @@ from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConf
 class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
     def __init__(self, config: InlineDoclingConfig) -> None:
         self.config = config
+        self.converter = DocumentConverter()
+        self.chunker = None
 
-    async def initialize(self) -> None: ...
+    async def initialize(self) -> None:
+        if self.config.chunk:
+            self.chunker = HybridChunker(tokenizer=self.config.tokenizer)
 
     async def shutdown(self) -> None: ...
 
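Note the lazy setup: the HybridChunker (and therefore the tokenizer) is only created when config.chunk is true, so the markdown-only path never loads it. A standalone sketch of the same pattern, using the docling imports from the hunk above; the flag and tokenizer id are placeholders.

# Illustrative sketch only; same lazy-initialization pattern as initialize() above.
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

chunk_enabled = True  # stands in for InlineDoclingConfig.chunk
converter = DocumentConverter()
chunker = (
    HybridChunker(tokenizer="sentence-transformers/all-MiniLM-L6-v2")  # assumed tokenizer id
    if chunk_enabled
    else None
)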
@@ -33,4 +42,23 @@ class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate
         preprocessor_id: str,
         preprocessor_inputs: List[PreprocessingInput],
         options: PreprocessorOptions,
-    ) -> PreprocessingResponse: ...
+    ) -> PreprocessingResponse:
+        results = []
+
+        for inp in preprocessor_inputs:
+            if isinstance(inp.path_or_content, str):
+                url = inp.path_or_content
+            elif isinstance(inp.path_or_content, URL):
+                url = inp.path_or_content.uri
+            else:
+                raise ValueError(f"Unexpected type {type(inp.path_or_content)} for input {inp.path_or_content}")
+
+            converted_document = self.converter.convert(url).document
+            if self.config.chunk:
+                result = self.chunker.chunk(converted_document)
+                results.extend([PreprocessingResult(data=chunk.text, metadata=chunk.meta) for chunk in result])
+            else:
+                result = converted_document.export_to_markdown()
+                results.append(result)
+
+        return PreprocessingResponse(status=True, results=results)
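Taken together, the new method body boils down to plain docling calls. A self-contained sketch of the two branches, kept outside the llama-stack types; the source URL and tokenizer id are placeholders, everything else follows the diff.

# Illustrative sketch only; mirrors the chunking / markdown branches above.
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

source = "https://arxiv.org/pdf/2408.09869"  # placeholder document URL
chunk = True  # stands in for InlineDoclingConfig.chunk

document = DocumentConverter().convert(source).document

if chunk:
    chunker = HybridChunker(tokenizer="sentence-transformers/all-MiniLM-L6-v2")  # assumed id
    for piece in chunker.chunk(document):
        print(piece.text[:80], piece.meta)
else:
    print(document.export_to_markdown())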