feat(api): add file_processor API skeleton (#4113)

Alina Ryan 2025-12-24 08:53:24 -05:00 committed by GitHub
parent 325a0bd7b3
commit 55a1da5526
16 changed files with 365 additions and 0 deletions

View file

@@ -13234,6 +13234,7 @@ components:
      - benchmarks
      - tool_groups
      - files
      - file_processors
      - prompts
      - conversations
      - connectors
@@ -14000,6 +14001,29 @@ components:
      - items
      title: ConversationItemCreateRequest
      type: object
    ProcessFileResponse:
      description: |-
        Response model for file processing operation.

        Returns a list of chunks ready for storage in vector databases.
        Each chunk contains the content and metadata.
      properties:
        chunks:
          description: Processed chunks from the file. Always returns at least one chunk.
          items:
            $ref: '#/components/schemas/Chunk'
          title: Chunks
          type: array
        metadata:
          additionalProperties: true
          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
          title: Metadata
          type: object
      required:
      - chunks
      - metadata
      title: ProcessFileResponse
      type: object
    PostTrainingJobLogStream:
      description: Stream of logs from a finetuning job.
      properties:
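For reference, a response conforming to the ProcessFileResponse schema above might look like the following. The values are illustrative, and the chunk shape is only sketched from the referenced `#/components/schemas/Chunk`:

```json
{
  "chunks": [
    {
      "content": "First page text...",
      "metadata": {"source": "report.pdf", "page": 1}
    }
  ],
  "metadata": {
    "processor": "docling",
    "processing_time_ms": 412,
    "page_count": 1,
    "extraction_method": "docling"
  }
}
```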

View file

@@ -0,0 +1,10 @@
---
sidebar_label: File Processors
title: File_Processors
---
# File_Processors
## Overview
This section contains documentation for all available providers for the **file_processors** API.

View file

@@ -9912,6 +9912,7 @@ components:
      - benchmarks
      - tool_groups
      - files
      - file_processors
      - prompts
      - conversations
      - connectors
@@ -10678,6 +10679,29 @@ components:
      - items
      title: ConversationItemCreateRequest
      type: object
    ProcessFileResponse:
      description: |-
        Response model for file processing operation.

        Returns a list of chunks ready for storage in vector databases.
        Each chunk contains the content and metadata.
      properties:
        chunks:
          description: Processed chunks from the file. Always returns at least one chunk.
          items:
            $ref: '#/components/schemas/Chunk'
          title: Chunks
          type: array
        metadata:
          additionalProperties: true
          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
          title: Metadata
          type: object
      required:
      - chunks
      - metadata
      title: ProcessFileResponse
      type: object
    PostTrainingJobLogStream:
      description: Stream of logs from a finetuning job.
      properties:

View file

@@ -8986,6 +8986,7 @@ components:
      - benchmarks
      - tool_groups
      - files
      - file_processors
      - prompts
      - conversations
      - connectors
@@ -9752,6 +9753,29 @@ components:
      - items
      title: ConversationItemCreateRequest
      type: object
    ProcessFileResponse:
      description: |-
        Response model for file processing operation.

        Returns a list of chunks ready for storage in vector databases.
        Each chunk contains the content and metadata.
      properties:
        chunks:
          description: Processed chunks from the file. Always returns at least one chunk.
          items:
            $ref: '#/components/schemas/Chunk'
          title: Chunks
          type: array
        metadata:
          additionalProperties: true
          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
          title: Metadata
          type: object
      required:
      - chunks
      - metadata
      title: ProcessFileResponse
      type: object
    PostTrainingJobLogStream:
      description: Stream of logs from a finetuning job.
      properties:

View file

@@ -11407,6 +11407,7 @@ components:
      - benchmarks
      - tool_groups
      - files
      - file_processors
      - prompts
      - conversations
      - connectors
@@ -12170,6 +12171,29 @@ components:
      - items
      title: ConversationItemCreateRequest
      type: object
    ProcessFileResponse:
      description: |-
        Response model for file processing operation.

        Returns a list of chunks ready for storage in vector databases.
        Each chunk contains the content and metadata.
      properties:
        chunks:
          description: Processed chunks from the file. Always returns at least one chunk.
          items:
            $ref: '#/components/schemas/Chunk'
          title: Chunks
          type: array
        metadata:
          additionalProperties: true
          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
          title: Metadata
          type: object
      required:
      - chunks
      - metadata
      title: ProcessFileResponse
      type: object
    PostTrainingJobLogStream:
      description: Stream of logs from a finetuning job.
      properties:

View file

@@ -13234,6 +13234,7 @@ components:
      - benchmarks
      - tool_groups
      - files
      - file_processors
      - prompts
      - conversations
      - connectors
@@ -14000,6 +14001,29 @@ components:
      - items
      title: ConversationItemCreateRequest
      type: object
    ProcessFileResponse:
      description: |-
        Response model for file processing operation.

        Returns a list of chunks ready for storage in vector databases.
        Each chunk contains the content and metadata.
      properties:
        chunks:
          description: Processed chunks from the file. Always returns at least one chunk.
          items:
            $ref: '#/components/schemas/Chunk'
          title: Chunks
          type: array
        metadata:
          additionalProperties: true
          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
          title: Metadata
          type: object
      required:
      - chunks
      - metadata
      title: ProcessFileResponse
      type: object
    PostTrainingJobLogStream:
      description: Stream of logs from a finetuning job.
      properties:

View file

@@ -37,6 +37,7 @@ from llama_stack_api import (
    DatasetsProtocolPrivate,
    Eval,
    ExternalApiSpec,
    FileProcessors,
    Files,
    Inference,
    InferenceProvider,
@@ -104,6 +105,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
        Api.files: Files,
        Api.prompts: Prompts,
        Api.conversations: Conversations,
        Api.file_processors: FileProcessors,
        Api.connectors: Connectors,
    }

View file

@@ -44,6 +44,7 @@ CATEGORIES = [
    "providers",
    "models",
    "files",
    "file_processors",
    "vector_io",
    "tool_runtime",
    "cli",

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack_api import ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return []

View file

@@ -144,6 +144,7 @@ from .datatypes import (
    VectorStoresProtocolPrivate,
)
from .eval import BenchmarkConfig, Eval, EvalCandidate, EvaluateResponse, ModelCandidate
from .file_processors import FileProcessors, ProcessFileResponse
from .files import (
    ExpiresAfter,
    Files,
@@ -563,6 +564,7 @@ __all__ = [
    "ExpiresAfter",
    "ExternalApiSpec",
    "ExtraBodyField",
    "FileProcessors",
    "Files",
    "Fp8QuantizationConfig",
    "clear_dynamic_schema_types",
@@ -793,6 +795,7 @@ __all__ = [
    "ParamType",
    "parse_type",
    "PostTraining",
    "ProcessFileResponse",
    "PostTrainingMetric",
    "PostTrainingJob",
    "PostTrainingJobArtifactsResponse",

View file

@@ -110,6 +110,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    :cvar benchmarks: Benchmark suite management
    :cvar tool_groups: Tool group organization
    :cvar files: File storage and management
    :cvar file_processors: File parsing and processing operations
    :cvar prompts: Prompt versions and management
    :cvar connectors: External connector management (e.g., MCP servers)
    :cvar inspect: Built-in system inspection and introspection
@@ -135,6 +136,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    benchmarks = "benchmarks"
    tool_groups = "tool_groups"
    files = "files"
    file_processors = "file_processors"
    prompts = "prompts"
    conversations = "conversations"
    connectors = "connectors"

View file

@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""File Processors API protocol and models.

This module contains the File Processors protocol definition.
Pydantic models are defined in llama_stack_api.file_processors.models.
The FastAPI router is defined in llama_stack_api.file_processors.fastapi_routes.
"""

# Import fastapi_routes for router factory access
from . import fastapi_routes

# Import protocol for re-export
from .api import FileProcessors

# Import models for re-export
from .models import ProcessFileResponse

__all__ = [
    "FileProcessors",
    "ProcessFileResponse",
    "fastapi_routes",
]

View file

@@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from fastapi import UploadFile

from llama_stack_api.vector_io import VectorStoreChunkingStrategy

from .models import ProcessFileResponse


@runtime_checkable
class FileProcessors(Protocol):
    """
    File Processor API for converting files into structured, processable content.

    This API provides a flexible interface for processing various file formats
    (PDFs, documents, images, etc.) into normalized text content that can be used for
    vector store ingestion, RAG applications, or standalone content extraction.

    The API focuses on parsing and normalization:
    - Multiple file formats through an extensible provider architecture
    - Multipart form uploads or file ID references
    - Configurable processing options per provider
    - Optional chunking using a provider's native capabilities
    - Rich metadata about processing results

    For embedding generation, use the chunks from this API with the separate
    embedding API to maintain a clean separation of concerns.

    Future providers can extend this interface to support additional formats,
    processing capabilities, and optimization strategies.
    """

    async def process_file(
        self,
        file: UploadFile | None = None,
        file_id: str | None = None,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> ProcessFileResponse:
        """
        Process a file into chunks ready for vector database storage.

        This method supports two modes of operation via a multipart form request:
        1. Direct upload: upload and process a file directly (file parameter)
        2. File storage: process a file already uploaded to file storage (file_id parameter)

        Exactly one of file or file_id must be provided.

        If no chunking_strategy is provided, the entire file content is returned as a single chunk.
        If a chunking_strategy is provided, the file is split according to the strategy.

        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.). Mutually exclusive with file_id.
        :param file_id: ID of a file already uploaded to file storage. Mutually exclusive with file.
        :param options: Provider-specific processing options (e.g., OCR settings, output format).
        :param chunking_strategy: Optional strategy for splitting content into chunks.
        :returns: ProcessFileResponse with chunks ready for vector database storage.
        """
        ...
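For illustration, a minimal in-memory provider satisfying this protocol might look like the sketch below. `InMemoryFileProcessor` is hypothetical, and `Chunk(content=..., metadata=...)` is an assumed constructor shape based on the schema above, not a confirmed signature:

```python
from typing import Any

from fastapi import UploadFile

from llama_stack_api import ProcessFileResponse
from llama_stack_api.vector_io import Chunk, VectorStoreChunkingStrategy


class InMemoryFileProcessor:
    """Hypothetical provider: decodes the upload as UTF-8 text and returns one chunk."""

    async def process_file(
        self,
        file: UploadFile | None = None,
        file_id: str | None = None,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> ProcessFileResponse:
        # The protocol requires exactly one of file / file_id.
        if (file is None) == (file_id is None):
            raise ValueError("Exactly one of file or file_id must be provided")
        if file is None:
            raise NotImplementedError("file_id lookup is out of scope for this sketch")
        text = (await file.read()).decode("utf-8", errors="replace")
        # No chunking_strategy handling here: the whole file comes back as a
        # single chunk, which is the documented default behavior.
        return ProcessFileResponse(
            # Chunk field names are an assumption for this sketch.
            chunks=[Chunk(content=text, metadata={"source": file.filename})],
            metadata={"processor": "in-memory-sketch", "extraction_method": "utf-8"},
        )
```

Because the protocol is `runtime_checkable`, `isinstance(InMemoryFileProcessor(), FileProcessors)` holds structurally without subclassing.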

View file

@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""FastAPI router for the File Processors API.

This module defines the FastAPI router for the File Processors API using standard
FastAPI route decorators. The router is defined in the API package to keep
all API-related code together.
"""

from typing import Annotated, Any

from fastapi import APIRouter, File, Form, UploadFile

from llama_stack_api.router_utils import standard_responses
from llama_stack_api.vector_io import VectorStoreChunkingStrategy
from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA

from .api import FileProcessors
from .models import ProcessFileResponse


def create_router(impl: FileProcessors) -> APIRouter:
    """Create a FastAPI router for the File Processors API.

    Args:
        impl: The FileProcessors implementation instance

    Returns:
        APIRouter configured for the File Processors API
    """
    router = APIRouter(
        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
        tags=["File Processors"],
        responses=standard_responses,
    )

    @router.post(
        "/file-processors/process",
        response_model=ProcessFileResponse,
        summary="Process a file into chunks ready for vector database storage.",
        description="Process a file into chunks ready for vector database storage. Supports direct upload via multipart form or processing files already uploaded to file storage via file_id. Exactly one of file or file_id must be provided.",
        responses={
            200: {"description": "The processed file chunks."},
        },
    )
    async def process_file(
        file: Annotated[
            UploadFile | None,
            File(description="The File object to be uploaded and processed. Mutually exclusive with file_id."),
        ] = None,
        file_id: Annotated[
            str | None, Form(description="ID of file already uploaded to file storage. Mutually exclusive with file.")
        ] = None,
        options: Annotated[
            dict[str, Any] | None,
            Form(
                description="Optional processing options. Provider-specific parameters (e.g., OCR settings, output format)."
            ),
        ] = None,
        chunking_strategy: Annotated[
            VectorStoreChunkingStrategy | None,
            Form(description="Optional chunking strategy for splitting content into chunks."),
        ] = None,
    ) -> ProcessFileResponse:
        # Pass the parameters directly to the implementation.
        # The protocol method signature expects individual parameters for multipart handling.
        return await impl.process_file(
            file=file,
            file_id=file_id,
            options=options,
            chunking_strategy=chunking_strategy,
        )

    return router
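A quick way to exercise the router, reusing the hypothetical `InMemoryFileProcessor` sketched earlier and assuming `LLAMA_STACK_API_V1ALPHA` resolves to `"v1alpha"`:

```python
from fastapi import FastAPI
from fastapi.testclient import TestClient

# Mount the router on a bare app; the real stack wires this up via its resolver.
app = FastAPI()
app.include_router(create_router(InMemoryFileProcessor()))

client = TestClient(app)
response = client.post(
    "/v1alpha/file-processors/process",  # assumed prefix; see LLAMA_STACK_API_V1ALPHA
    files={"file": ("notes.txt", b"hello world", "text/plain")},
)
response.raise_for_status()
print(response.json()["chunks"][0])
```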

View file

@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""Pydantic models for File Processors API responses.

This module defines the response models for the File Processors API
using Pydantic with Field descriptions for OpenAPI schema generation.

Request models are not needed for this API since it uses multipart form data
with individual parameters rather than a JSON request body.
"""

from typing import Any

from pydantic import BaseModel, Field

from llama_stack_api.schema_utils import json_schema_type
from llama_stack_api.vector_io import Chunk


@json_schema_type
class ProcessFileResponse(BaseModel):
    """Response model for file processing operation.

    Returns a list of chunks ready for storage in vector databases.
    Each chunk contains the content and metadata.
    """

    chunks: list[Chunk] = Field(..., description="Processed chunks from the file. Always returns at least one chunk.")
    metadata: dict[str, Any] = Field(
        ...,
        description="Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.",
    )


__all__ = [
    "ProcessFileResponse",
]
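A minimal round-trip of the model, again assuming `Chunk` accepts `content` and `metadata` keyword arguments:

```python
from llama_stack_api import ProcessFileResponse
from llama_stack_api.vector_io import Chunk

resp = ProcessFileResponse(
    chunks=[Chunk(content="hello world", metadata={"source": "notes.txt"})],
    metadata={"processor": "example", "processing_time_ms": 3},
)
assert resp.chunks  # at least one chunk, per the field description
print(resp.model_dump_json(indent=2))  # serializes to the OpenAPI schema shown above
```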