# File: llama-stack-mirror/llama_stack/apis/preprocessing/preprocessing.py
# Last modified: 2025-04-03 11:14:11 +02:00
#
# 78 lines, 2.1 KiB, Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.preprocessors.preprocessors import Preprocessor
from llama_stack.apis.vector_io import Chunk
from llama_stack.schema_utils import json_schema_type, webmethod
class PreprocessingDataType(Enum):
    """Tags for the kind of data a preprocessing element carries.

    Every member's value is the same string as its name, so a member can be
    recovered from its string form, e.g. ``PreprocessingDataType("chunks")``.
    """

    document_uri = "document_uri"
    document_directory_uri = "document_directory_uri"
    binary_document = "binary_document"
    raw_text_document = "raw_text_document"
    chunks = "chunks"
class PreprocessingDataFormat(Enum):
    """Tags for the document format of a preprocessing element.

    Every member's value is the same string as its name (a short,
    extension-like token such as ``"pdf"`` or ``"csv"``).
    """

    pdf = "pdf"
    docx = "docx"
    xlsx = "xlsx"
    pptx = "pptx"
    md = "md"
    json = "json"
    html = "html"
    csv = "csv"
    txt = "txt"
@json_schema_type
class PreprocessingDataElement(BaseModel):
    """A single unit of data passed into, or returned from, preprocessing.

    The type and format tags are optional and default to ``None``; the
    identifier and the payload field carry no default.
    """

    # Identifier of this element.
    data_element_id: str
    # Optional tag describing what kind of data the element holds.
    data_element_type: Optional[PreprocessingDataType] = None
    # Optional tag describing the document format of the element.
    data_element_format: Optional[PreprocessingDataFormat] = None
    # The payload itself: a plain string, interleaved content, a URL, a Chunk,
    # or None.
    # NOTE(review): the annotation allows None but no default is declared, so
    # (under pydantic v2 semantics) callers must still pass the field
    # explicitly -- confirm this asymmetry with the two fields above is intended.
    data_element_path_or_content: str | InterleavedContent | URL | Chunk | None
# Type of the `options` field on PreprocessorChainElement: a mapping from
# string keys to arbitrary values. Which keys are recognised is not defined
# in this module.
PreprocessorOptions = Dict[str, Any]
@json_schema_type
class PreprocessorChainElement(BaseModel):
    """One entry of a preprocessor chain.

    Pairs the identifier of a preprocessor with an optional options mapping
    for that entry.
    """

    # Identifier of the preprocessor this entry refers to.
    preprocessor_id: str
    # Optional string-keyed options for this entry; defaults to None.
    options: Optional[PreprocessorOptions] = None
# A chain is a list of PreprocessorChainElement entries; it is the type of the
# `preprocessors` argument of Preprocessing.preprocess.
# NOTE(review): presumably the entries are applied in list order -- confirm
# against the provider implementations.
PreprocessorChain = List[PreprocessorChainElement]
@json_schema_type
class PreprocessorResponse(BaseModel):
    """Result returned by ``Preprocessing.preprocess``."""

    # Success flag for the preprocessing call.
    success: bool
    # Data-type tag for the returned results, or None.
    # NOTE(review): nullable but declared without a default, so (under
    # pydantic v2 semantics) it must always be passed explicitly -- confirm
    # this is intended.
    output_data_type: PreprocessingDataType | None
    # Optional list of resulting data elements; defaults to None.
    results: Optional[List[PreprocessingDataElement]] = None
class PreprocessorStore(Protocol):
    """Structural interface for looking up a preprocessor by its identifier."""

    def get_preprocessor(self, preprocessor_id: str) -> Preprocessor:
        """Return the ``Preprocessor`` for ``preprocessor_id``.

        Behaviour for an unknown identifier is left to the implementation;
        this protocol does not specify it.
        """
        ...
@runtime_checkable
class Preprocessing(Protocol):
    """Protocol for the preprocessing API.

    Declared ``runtime_checkable``, so ``isinstance`` checks against it are
    permitted; such checks only verify that the members exist, not their
    signatures.
    """

    # Store used to look up preprocessors by id; may be None.
    preprocessor_store: PreprocessorStore | None

    @webmethod(route="/preprocess", method="POST")
    async def preprocess(
        self,
        preprocessors: PreprocessorChain,
        preprocessor_inputs: List[PreprocessingDataElement],
    ) -> PreprocessorResponse:
        """Run a chain of preprocessors over the given input elements.

        Registered through ``webmethod`` as ``POST /preprocess``.

        :param preprocessors: The chain of preprocessor entries to use.
        :param preprocessor_inputs: The data elements to preprocess.
        :returns: A ``PreprocessorResponse`` describing the outcome.
        """
        ...