The first draft of the Preprocessing API.

This commit is contained in:
ilya-kolchinsky 2025-03-03 13:32:17 +01:00
parent 7f9b767277
commit aa1b670d5c
18 changed files with 327 additions and 0 deletions

View file

@ -20,6 +20,7 @@ class Api(Enum):
eval = "eval" eval = "eval"
post_training = "post_training" post_training = "post_training"
tool_runtime = "tool_runtime" tool_runtime = "tool_runtime"
preprocessing = "preprocessing"
telemetry = "telemetry" telemetry = "telemetry"
@ -30,6 +31,7 @@ class Api(Enum):
scoring_functions = "scoring_functions" scoring_functions = "scoring_functions"
benchmarks = "benchmarks" benchmarks = "benchmarks"
tool_groups = "tool_groups" tool_groups = "tool_groups"
preprocessors = "preprocessors"
# built-in API # built-in API
inspect = "inspect" inspect = "inspect"

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .preprocessing import * # noqa: F401 F403

View file

@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.preprocessing.preprocessors import Preprocessor
from llama_stack.schema_utils import json_schema_type, webmethod
class PreprocessingInputType(Enum):
    """Discriminates how a ``PreprocessingInput`` payload should be interpreted."""

    # The payload carries the raw document content itself.
    document_content = "document_content"
    # The payload is a path (or URL) that points at the document.
    document_path = "document_path"
@json_schema_type
class PreprocessingInput(BaseModel):
    """One document (or a reference to one) submitted for preprocessing."""

    # Caller-chosen identifier used to correlate this input with its result.
    preprocessor_input_id: str
    # How to interpret `path_or_content`. NOTE(review): when None the provider
    # presumably auto-detects — confirm the intended fallback behavior.
    preprocessor_input_type: Optional[PreprocessingInputType]
    # Either the raw document content or a path/URL to it, per the type above.
    path_or_content: str | URL
# Free-form, provider-specific options passed through to Preprocessing.preprocess.
PreprocessorOptions = Dict[str, Any]

# TODO: shouldn't be just a string
PreprocessingResult = str
@json_schema_type
class PreprocessingResponse(BaseModel):
    """Outcome of a `Preprocessing.preprocess` call."""

    # True when preprocessing succeeded, False otherwise.
    status: bool
    # Per-input results; None when no results are available (e.g. on failure).
    # Simplified from `List[str | PreprocessingResult]`: PreprocessingResult is
    # currently an alias for str, so the union was redundant. When the alias
    # becomes a real type (see its TODO), only this one name needs updating.
    results: Optional[List[PreprocessingResult]]
class PreprocessorStore(Protocol):
    """Lookup interface a Preprocessing provider uses to resolve registered preprocessors."""

    def get_preprocessor(self, preprocessor_id: str) -> Preprocessor: ...
@runtime_checkable
class Preprocessing(Protocol):
    """Routable API: run a registered preprocessor over a batch of inputs."""

    # Injected by the routing layer so providers can resolve preprocessor ids.
    preprocessor_store: PreprocessorStore

    @webmethod(route="/preprocess", method="POST")
    async def preprocess(
        self,
        preprocessor_id: str,
        preprocessor_inputs: List[PreprocessingInput],
        options: PreprocessorOptions,
    ) -> PreprocessingResponse: ...

View file

@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type
class Preprocessor(Resource):
    """A registered preprocessor, modeled as a routable Resource."""

    # Pins the discriminator value used by the RoutableObjectWithProvider union.
    type: Literal[ResourceType.preprocessor.value] = ResourceType.preprocessor.value

    @property
    def preprocessor_id(self) -> str:
        # Domain-specific alias for the generic Resource.identifier.
        return self.identifier

    @property
    def provider_preprocessor_id(self) -> str:
        # Alias for Resource.provider_resource_id. NOTE(review): the base field
        # may be Optional — confirm callers tolerate a None here.
        return self.provider_resource_id

    # Arbitrary provider/user metadata. NOTE(review): declared after the
    # properties — valid in pydantic, but fields conventionally precede methods.
    metadata: Optional[Dict[str, Any]] = None
class PreprocessorInput(BaseModel):
    """Declarative registration entry for a preprocessor (e.g. from run config)."""

    # Identifier to register the preprocessor under.
    preprocessor_id: str
    # Provider to register with; None lets the routing table pick one.
    provider_id: Optional[str] = None
    # Arbitrary metadata stored alongside the registration.
    metadata: Optional[Dict[str, Any]] = None
class ListPreprocessorsResponse(BaseModel):
    """Response envelope for `Preprocessors.list_preprocessors`."""

    # All currently registered preprocessors.
    data: List[Preprocessor]
@runtime_checkable
@trace_protocol
class Preprocessors(Protocol):
    """Registry API for preprocessor resources (list/get/register/unregister)."""

    @webmethod(route="/preprocessors", method="GET")
    async def list_preprocessors(self) -> ListPreprocessorsResponse: ...

    @webmethod(route="/preprocessors/{preprocessor_id:path}", method="GET")
    async def get_preprocessor(
        self,
        preprocessor_id: str,
    ) -> Optional[Preprocessor]: ...

    @webmethod(route="/preprocessors", method="POST")
    async def register_preprocessor(
        self,
        preprocessor_id: str,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Preprocessor: ...

    @webmethod(route="/preprocessors/{preprocessor_id:path}", method="DELETE")
    async def unregister_preprocessor(
        self,
        preprocessor_id: str,
    ) -> None: ...

View file

@ -18,6 +18,7 @@ class ResourceType(Enum):
benchmark = "benchmark" benchmark = "benchmark"
tool = "tool" tool = "tool"
tool_group = "tool_group" tool_group = "tool_group"
preprocessor = "preprocessor"
class Resource(BaseModel): class Resource(BaseModel):

View file

@ -14,6 +14,8 @@ from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval from llama_stack.apis.eval import Eval
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.preprocessing import Preprocessing, Preprocessor
from llama_stack.apis.preprocessing.preprocessors import PreprocessorInput
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
@ -40,6 +42,7 @@ RoutableObject = Union[
Benchmark, Benchmark,
Tool, Tool,
ToolGroup, ToolGroup,
Preprocessor,
] ]
@ -53,6 +56,7 @@ RoutableObjectWithProvider = Annotated[
Benchmark, Benchmark,
Tool, Tool,
ToolGroup, ToolGroup,
Preprocessor,
], ],
Field(discriminator="type"), Field(discriminator="type"),
] ]
@ -65,6 +69,7 @@ RoutedProtocol = Union[
Scoring, Scoring,
Eval, Eval,
ToolRuntime, ToolRuntime,
Preprocessing,
] ]
@ -175,6 +180,7 @@ a default SQLite store will be used.""",
scoring_fns: List[ScoringFnInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
benchmarks: List[BenchmarkInput] = Field(default_factory=list) benchmarks: List[BenchmarkInput] = Field(default_factory=list)
tool_groups: List[ToolGroupInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list)
preprocessors: List[PreprocessorInput] = Field(default_factory=list)
server: ServerConfig = Field( server: ServerConfig = Field(
default_factory=ServerConfig, default_factory=ServerConfig,

View file

@ -51,6 +51,10 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
routing_table_api=Api.tool_groups, routing_table_api=Api.tool_groups,
router_api=Api.tool_runtime, router_api=Api.tool_runtime,
), ),
AutoRoutedApiInfo(
routing_table_api=Api.preprocessors,
router_api=Api.preprocessing,
),
] ]

View file

@ -17,6 +17,8 @@ from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.preprocessing import Preprocessing
from llama_stack.apis.preprocessing.preprocessors import Preprocessors
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.scoring_functions import ScoringFunctions
@ -41,6 +43,7 @@ from llama_stack.providers.datatypes import (
DatasetsProtocolPrivate, DatasetsProtocolPrivate,
InlineProviderSpec, InlineProviderSpec,
ModelsProtocolPrivate, ModelsProtocolPrivate,
PreprocessorsProtocolPrivate,
ProviderSpec, ProviderSpec,
RemoteProviderConfig, RemoteProviderConfig,
RemoteProviderSpec, RemoteProviderSpec,
@ -77,6 +80,8 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.post_training: PostTraining, Api.post_training: PostTraining,
Api.tool_groups: ToolGroups, Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime, Api.tool_runtime: ToolRuntime,
Api.preprocessing: Preprocessing,
Api.preprocessors: Preprocessors,
} }
@ -93,6 +98,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
Api.scoring_functions, Api.scoring_functions,
), ),
Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
Api.preprocessing: (PreprocessorsProtocolPrivate, Preprocessors, Api.preprocessors),
} }

View file

@ -14,6 +14,7 @@ from .routing_tables import (
BenchmarksRoutingTable, BenchmarksRoutingTable,
DatasetsRoutingTable, DatasetsRoutingTable,
ModelsRoutingTable, ModelsRoutingTable,
PreprocessorsRoutingTable,
ScoringFunctionsRoutingTable, ScoringFunctionsRoutingTable,
ShieldsRoutingTable, ShieldsRoutingTable,
ToolGroupsRoutingTable, ToolGroupsRoutingTable,
@ -35,6 +36,7 @@ async def get_routing_table_impl(
"scoring_functions": ScoringFunctionsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable,
"benchmarks": BenchmarksRoutingTable, "benchmarks": BenchmarksRoutingTable,
"tool_groups": ToolGroupsRoutingTable, "tool_groups": ToolGroupsRoutingTable,
"preprocessors": PreprocessorsRoutingTable,
} }
if api.value not in api_to_tables: if api.value not in api_to_tables:
@ -50,6 +52,7 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) ->
DatasetIORouter, DatasetIORouter,
EvalRouter, EvalRouter,
InferenceRouter, InferenceRouter,
PreprocessingRouter,
SafetyRouter, SafetyRouter,
ScoringRouter, ScoringRouter,
ToolRuntimeRouter, ToolRuntimeRouter,
@ -64,6 +67,7 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) ->
"scoring": ScoringRouter, "scoring": ScoringRouter,
"eval": EvalRouter, "eval": EvalRouter,
"tool_runtime": ToolRuntimeRouter, "tool_runtime": ToolRuntimeRouter,
"preprocessing": PreprocessingRouter,
} }
if api.value not in api_to_routers: if api.value not in api_to_routers:
raise ValueError(f"API {api.value} not found in router map") raise ValueError(f"API {api.value} not found in router map")

View file

@ -34,6 +34,7 @@ from llama_stack.apis.inference import (
ToolPromptFormat, ToolPromptFormat,
) )
from llama_stack.apis.models import ModelType from llama_stack.apis.models import ModelType
from llama_stack.apis.preprocessing import Preprocessing, PreprocessingInput, PreprocessingResponse, PreprocessorOptions
from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import ( from llama_stack.apis.scoring import (
ScoreBatchResponse, ScoreBatchResponse,
@ -482,3 +483,28 @@ class ToolRuntimeRouter(ToolRuntime):
self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
) -> List[ToolDef]: ) -> List[ToolDef]:
return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint) return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)
class PreprocessingRouter(Preprocessing):
    """Routes `preprocess` calls to the provider owning the given preprocessor."""

    def __init__(
        self,
        routing_table: RoutingTable,
    ) -> None:
        self.routing_table = routing_table

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def preprocess(
        self,
        preprocessor_id: str,
        preprocessor_inputs: List[PreprocessingInput],
        options: PreprocessorOptions,
    ) -> PreprocessingResponse:
        # Fix: also forward preprocessor_id — the provider-side
        # Preprocessing.preprocess signature requires it, so omitting it
        # raised a TypeError on every routed call.
        return await self.routing_table.get_provider_impl(preprocessor_id).preprocess(
            preprocessor_id=preprocessor_id,
            preprocessor_inputs=preprocessor_inputs,
            options=options,
        )

View file

@ -14,6 +14,7 @@ from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.preprocessing.preprocessors import ListPreprocessorsResponse, Preprocessor, Preprocessors
from llama_stack.apis.resource import ResourceType from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import ( from llama_stack.apis.scoring_functions import (
ListScoringFunctionsResponse, ListScoringFunctionsResponse,
@ -66,6 +67,8 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
return await p.register_benchmark(obj) return await p.register_benchmark(obj)
elif api == Api.tool_runtime: elif api == Api.tool_runtime:
return await p.register_tool(obj) return await p.register_tool(obj)
elif api == Api.preprocessing:
return await p.register_preprocessor(obj)
else: else:
raise ValueError(f"Unknown API {api} for registering object with provider") raise ValueError(f"Unknown API {api} for registering object with provider")
@ -80,6 +83,8 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
return await p.unregister_dataset(obj.identifier) return await p.unregister_dataset(obj.identifier)
elif api == Api.tool_runtime: elif api == Api.tool_runtime:
return await p.unregister_tool(obj.identifier) return await p.unregister_tool(obj.identifier)
elif api == Api.preprocessing:
return await p.unregister_preprocessor(obj.identifier)
else: else:
raise ValueError(f"Unregister not supported for {api}") raise ValueError(f"Unregister not supported for {api}")
@ -127,6 +132,8 @@ class CommonRoutingTableImpl(RoutingTable):
p.benchmark_store = self p.benchmark_store = self
elif api == Api.tool_runtime: elif api == Api.tool_runtime:
p.tool_store = self p.tool_store = self
elif api == Api.preprocessing:
p.preprocessor_store = self
async def shutdown(self) -> None: async def shutdown(self) -> None:
for p in self.impls_by_provider_id.values(): for p in self.impls_by_provider_id.values():
@ -148,6 +155,8 @@ class CommonRoutingTableImpl(RoutingTable):
return ("Eval", "benchmark") return ("Eval", "benchmark")
elif isinstance(self, ToolGroupsRoutingTable): elif isinstance(self, ToolGroupsRoutingTable):
return ("Tools", "tool") return ("Tools", "tool")
elif isinstance(self, PreprocessorsRoutingTable):
return ("Preprocessing", "preprocessor")
else: else:
raise ValueError("Unknown routing table type") raise ValueError("Unknown routing table type")
@ -536,3 +545,40 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
async def shutdown(self) -> None: async def shutdown(self) -> None:
pass pass
class PreprocessorsRoutingTable(CommonRoutingTableImpl, Preprocessors):
    """Routing-table implementation of the Preprocessors registry API."""

    async def list_preprocessors(self) -> ListPreprocessorsResponse:
        return ListPreprocessorsResponse(data=await self.get_all_with_type(ResourceType.preprocessor.value))

    async def get_preprocessor(self, preprocessor_id: str) -> Optional[Preprocessor]:
        return await self.get_object_by_identifier("preprocessor", preprocessor_id)

    async def register_preprocessor(
        self,
        preprocessor_id: str,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Preprocessor:
        """Register a preprocessor, inferring the provider when only one exists.

        Raises:
            ValueError: if provider_id is omitted and multiple providers exist.
        """
        if provider_id is None:
            if len(self.impls_by_provider_id) == 1:
                provider_id = list(self.impls_by_provider_id.keys())[0]
            else:
                raise ValueError(
                    "No provider specified and multiple providers available. Please specify a provider_id."
                )
        preprocessor = Preprocessor(
            identifier=preprocessor_id,
            provider_resource_id=preprocessor_id,
            provider_id=provider_id,
            metadata=metadata,
        )
        # Fix: dropped the redundant `preprocessor.provider_id = provider_id`
        # re-assignment — the constructor above already sets it.
        await self.register_object(preprocessor)
        return preprocessor

    async def unregister_preprocessor(self, preprocessor_id: str) -> None:
        existing_preprocessor = await self.get_preprocessor(preprocessor_id)
        if existing_preprocessor is None:
            raise ValueError(f"Preprocessor {preprocessor_id} not found")
        await self.unregister_object(existing_preprocessor)

View file

@ -24,6 +24,8 @@ from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.preprocessing import Preprocessing
from llama_stack.apis.preprocessing.preprocessors import Preprocessors
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.scoring_functions import ScoringFunctions
@ -65,6 +67,8 @@ class LlamaStack(
ToolRuntime, ToolRuntime,
RAGToolRuntime, RAGToolRuntime,
Files, Files,
Preprocessing,
Preprocessors,
): ):
pass pass
@ -82,6 +86,7 @@ RESOURCES = [
), ),
("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
("preprocessors", Api.preprocessors, "register_preprocessor", "list_preprocessors"),
] ]

View file

@ -13,6 +13,7 @@ from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasets import Dataset from llama_stack.apis.datasets import Dataset
from llama_stack.apis.datatypes import Api from llama_stack.apis.datatypes import Api
from llama_stack.apis.models import Model from llama_stack.apis.models import Model
from llama_stack.apis.preprocessing import Preprocessor
from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
from llama_stack.apis.tools import Tool from llama_stack.apis.tools import Tool
@ -58,6 +59,12 @@ class ToolsProtocolPrivate(Protocol):
async def unregister_tool(self, tool_id: str) -> None: ... async def unregister_tool(self, tool_id: str) -> None: ...
class PreprocessorsProtocolPrivate(Protocol):
    """Private protocol a Preprocessing provider implements so the routing
    table can push preprocessor (un)registrations down to it."""

    async def register_preprocessor(self, preprocessor: Preprocessor) -> None: ...

    async def unregister_preprocessor(self, preprocessor_id: str) -> None: ...
@json_schema_type @json_schema_type
class ProviderSpec(BaseModel): class ProviderSpec(BaseModel):
api: Api api: Api

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import InlineDoclingConfig
async def get_provider_impl(
    config: InlineDoclingConfig,
    _deps,
):
    """Build and initialize the inline Docling preprocessing provider."""
    # Imported lazily so loading this package does not pull in docling.
    from .docling import InclineDoclingPreprocessorImpl

    provider = InclineDoclingPreprocessorImpl(config)
    await provider.initialize()
    return provider

View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel
class InlineDoclingConfig(BaseModel):
    """Configuration for the inline Docling preprocessing provider (no options yet)."""

View file

@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.apis.preprocessing import (
Preprocessing,
PreprocessingInput,
PreprocessingResponse,
Preprocessor,
PreprocessorOptions,
)
from llama_stack.providers.datatypes import PreprocessorsProtocolPrivate
from llama_stack.providers.inline.preprocessing.docling import InlineDoclingConfig
# NOTE(review): "Incline" looks like a typo of "Inline" — renaming would touch
# every referencing module (e.g. the provider factory), so it is flagged only.
class InclineDoclingPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
    """Skeleton inline Docling preprocessor; every method is still a stub."""

    def __init__(self, config: InlineDoclingConfig) -> None:
        self.config = config

    async def initialize(self) -> None: ...

    async def shutdown(self) -> None: ...

    async def register_preprocessor(self, preprocessor: Preprocessor) -> None: ...

    async def unregister_preprocessor(self, preprocessor_id: str) -> None: ...

    async def preprocess(
        self,
        preprocessor_id: str,
        preprocessor_inputs: List[PreprocessingInput],
        options: PreprocessorOptions,
    ) -> PreprocessingResponse: ...

View file

@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.providers.datatypes import (
Api,
InlineProviderSpec,
ProviderSpec,
)
def available_providers() -> List[ProviderSpec]:
    """Enumerate the provider specs registered for the preprocessing API."""
    docling = InlineProviderSpec(
        api=Api.preprocessing,
        provider_type="inline::docling",
        pip_packages=["docling"],
        module="llama_stack.providers.inline.preprocessing.docling",
        config_class="llama_stack.providers.inline.preprocessing.docling.InlineDoclingConfig",
        api_dependencies=[],
    )
    return [docling]