mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-28 01:11:59 +00:00)

feat(api): add file_processor API skeleton (#4113)

parent 325a0bd7b3, commit 55a1da5526
16 changed files with 365 additions and 0 deletions
@@ -13234,6 +13234,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors

@@ -14000,6 +14001,29 @@ components:
       - items
     title: ConversationItemCreateRequest
     type: object
+  ProcessFileResponse:
+    description: |-
+      Response model for file processing operation.
+
+      Returns a list of chunks ready for storage in vector databases.
+      Each chunk contains the content and metadata.
+    properties:
+      chunks:
+        description: Processed chunks from the file. Always returns at least one chunk.
+        items:
+          $ref: '#/components/schemas/Chunk'
+        title: Chunks
+        type: array
+      metadata:
+        additionalProperties: true
+        description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+        title: Metadata
+        type: object
+    required:
+    - chunks
+    - metadata
+    title: ProcessFileResponse
+    type: object
   PostTrainingJobLogStream:
     description: Stream of logs from a finetuning job.
     properties:
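To make the new schema concrete, here is an illustrative response body that would validate against ProcessFileResponse. All values are invented, and the chunk fields simply follow the referenced Chunk schema (content plus metadata).

# Illustrative only: a payload shaped like the ProcessFileResponse schema above.
# Values are made up; chunk fields follow the referenced Chunk schema.
example_process_file_response = {
    "chunks": [
        {"content": "Introduction ...", "metadata": {"page": 1}},
        {"content": "Methods ...", "metadata": {"page": 2}},
    ],
    "metadata": {
        "processor": "docling",
        "processing_time_ms": 842,
        "page_count": 2,
        "extraction_method": "docling",
    },
}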
docs/docs/providers/file_processors/index.mdx (new file, 10 lines)

@@ -0,0 +1,10 @@
+---
+sidebar_label: File Processors
+title: File_Processors
+---
+
+# File_Processors
+
+## Overview
+
+This section contains documentation for all available providers for the **file_processors** API.
docs/static/deprecated-llama-stack-spec.yaml (vendored, 24 lines changed)

Same two additions as in the hunks above: the "- file_processors" entry is added to the API name list at @@ -9912,6 +9912,7 @@ components:, and the ProcessFileResponse schema block is added at @@ -10678,6 +10679,29 @@ components:.
docs/static/experimental-llama-stack-spec.yaml (vendored, 24 lines changed)

Same two additions, at @@ -8986,6 +8986,7 @@ components: ("- file_processors") and @@ -9752,6 +9753,29 @@ components: (ProcessFileResponse schema).
docs/static/llama-stack-spec.yaml (vendored, 24 lines changed)

Same two additions, at @@ -11407,6 +11407,7 @@ components: ("- file_processors") and @@ -12170,6 +12171,29 @@ components: (ProcessFileResponse schema).
docs/static/stainless-llama-stack-spec.yaml (vendored, 24 lines changed)

Same two additions, at @@ -13234,6 +13234,7 @@ components: ("- file_processors") and @@ -14000,6 +14001,29 @@ components: (ProcessFileResponse schema).
@@ -37,6 +37,7 @@ from llama_stack_api import (
     DatasetsProtocolPrivate,
     Eval,
     ExternalApiSpec,
+    FileProcessors,
     Files,
     Inference,
     InferenceProvider,

@@ -104,6 +105,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.files: Files,
         Api.prompts: Prompts,
         Api.conversations: Conversations,
+        Api.file_processors: FileProcessors,
         Api.connectors: Connectors,
     }
@@ -44,6 +44,7 @@ CATEGORIES = [
     "providers",
     "models",
     "files",
+    "file_processors",
     "vector_io",
     "tool_runtime",
     "cli",
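A logger created under the new category would, as a sketch, look like the call below; this assumes the get_logger helper in llama_stack.log keeps its usual name/category keyword arguments.

# Sketch, not part of this commit: obtaining a logger in the new category.
# Assumes llama_stack.log.get_logger(name=..., category=...) as used elsewhere in the codebase.
from llama_stack.log import get_logger

logger = get_logger(name=__name__, category="file_processors")
logger.info("file_processors provider initialized")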
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
src/llama_stack/providers/registry/file_processors.py (new file, 11 lines)

@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack_api import ProviderSpec
+
+
+def available_providers() -> list[ProviderSpec]:
+    return []
@@ -144,6 +144,7 @@ from .datatypes import (
     VectorStoresProtocolPrivate,
 )
 from .eval import BenchmarkConfig, Eval, EvalCandidate, EvaluateResponse, ModelCandidate
+from .file_processors import FileProcessors, ProcessFileResponse
 from .files import (
     ExpiresAfter,
     Files,

@@ -563,6 +564,7 @@ __all__ = [
     "ExpiresAfter",
     "ExternalApiSpec",
     "ExtraBodyField",
+    "FileProcessors",
     "Files",
     "Fp8QuantizationConfig",
     "clear_dynamic_schema_types",

@@ -793,6 +795,7 @@ __all__ = [
     "ParamType",
     "parse_type",
     "PostTraining",
+    "ProcessFileResponse",
     "PostTrainingMetric",
     "PostTrainingJob",
     "PostTrainingJobArtifactsResponse",
@@ -110,6 +110,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar benchmarks: Benchmark suite management
     :cvar tool_groups: Tool group organization
     :cvar files: File storage and management
+    :cvar file_processors: File parsing and processing operations
     :cvar prompts: Prompt versions and management
     :cvar connectors: External connector management (e.g., MCP servers)
     :cvar inspect: Built-in system inspection and introspection

@@ -135,6 +136,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     benchmarks = "benchmarks"
     tool_groups = "tool_groups"
     files = "files"
+    file_processors = "file_processors"
     prompts = "prompts"
     conversations = "conversations"
     connectors = "connectors"
src/llama_stack_api/file_processors/__init__.py (new file, 27 lines)

@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""File Processors API protocol and models.
+
+This module contains the File Processors protocol definition.
+Pydantic models are defined in llama_stack_api.file_processors.models.
+The FastAPI router is defined in llama_stack_api.file_processors.fastapi_routes.
+"""
+
+# Import fastapi_routes for router factory access
+from . import fastapi_routes
+
+# Import protocol for re-export
+from .api import FileProcessors
+
+# Import models for re-export
+from .models import ProcessFileResponse
+
+__all__ = [
+    "FileProcessors",
+    "ProcessFileResponse",
+    "fastapi_routes",
+]
src/llama_stack_api/file_processors/api.py (new file, 64 lines)

@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Protocol, runtime_checkable
+
+from fastapi import UploadFile
+
+from llama_stack_api.vector_io import VectorStoreChunkingStrategy
+
+from .models import ProcessFileResponse
+
+
+@runtime_checkable
+class FileProcessors(Protocol):
+    """
+    File Processor API for converting files into structured, processable content.
+
+    This API provides a flexible interface for processing various file formats
+    (PDFs, documents, images, etc.) into normalized text content that can be used for
+    vector store ingestion, RAG applications, or standalone content extraction.
+
+    The API focuses on parsing and normalization:
+    - Multiple file formats through extensible provider architecture
+    - Multipart form uploads or file ID references
+    - Configurable processing options per provider
+    - Optional chunking using provider's native capabilities
+    - Rich metadata about processing results
+
+    For embedding generation, use the chunks from this API with the separate
+    embedding API to maintain clean separation of concerns.
+
+    Future providers can extend this interface to support additional formats,
+    processing capabilities, and optimization strategies.
+    """
+
+    async def process_file(
+        self,
+        file: UploadFile | None = None,
+        file_id: str | None = None,
+        options: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> ProcessFileResponse:
+        """
+        Process a file into chunks ready for vector database storage.
+
+        This method supports two modes of operation via multipart form request:
+        1. Direct upload: Upload and process a file directly (file parameter)
+        2. File storage: Process files already uploaded to file storage (file_id parameter)
+
+        Exactly one of file or file_id must be provided.
+
+        If no chunking_strategy is provided, the entire file content is returned as a single chunk.
+        If chunking_strategy is provided, the file is split according to the strategy.
+
+        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.). Mutually exclusive with file_id.
+        :param file_id: ID of file already uploaded to file storage. Mutually exclusive with file.
+        :param options: Provider-specific processing options (e.g., OCR settings, output format).
+        :param chunking_strategy: Optional strategy for splitting content into chunks.
+        :returns: ProcessFileResponse with chunks ready for vector database storage.
+        """
+        ...
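For orientation, a provider implementing this protocol might look roughly like the sketch below. It is not part of this commit, ignores chunking_strategy, and assumes Chunk (from llama_stack_api.vector_io) accepts a plain string as content plus a metadata dict.

# A minimal provider-side sketch, not part of this commit. Assumes Chunk accepts
# content as a plain string and metadata as a dict; real parsing, file_id lookup,
# and chunking are omitted for brevity.
from typing import Any

from fastapi import UploadFile

from llama_stack_api import FileProcessors, ProcessFileResponse
from llama_stack_api.vector_io import Chunk, VectorStoreChunkingStrategy


class PassthroughFileProcessor(FileProcessors):
    """Returns the decoded file content as a single chunk."""

    async def process_file(
        self,
        file: UploadFile | None = None,
        file_id: str | None = None,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> ProcessFileResponse:
        # The API contract: exactly one of file or file_id must be provided.
        if (file is None) == (file_id is None):
            raise ValueError("Provide exactly one of file or file_id")
        if file_id is not None:
            raise NotImplementedError("file_id lookup would go through the Files API")
        text = (await file.read()).decode("utf-8", errors="replace")
        return ProcessFileResponse(
            chunks=[Chunk(content=text, metadata={"filename": file.filename})],
            metadata={"processor": "passthrough", "extraction_method": "plain-text"},
        )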
src/llama_stack_api/file_processors/fastapi_routes.py (new file, 78 lines)

@@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""FastAPI router for the File Processors API.
+
+This module defines the FastAPI router for the File Processors API using standard
+FastAPI route decorators. The router is defined in the API package to keep
+all API-related code together.
+"""
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, File, Form, UploadFile
+
+from llama_stack_api.router_utils import standard_responses
+from llama_stack_api.vector_io import VectorStoreChunkingStrategy
+from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
+
+from .api import FileProcessors
+from .models import ProcessFileResponse
+
+
+def create_router(impl: FileProcessors) -> APIRouter:
+    """Create a FastAPI router for the File Processors API.
+
+    Args:
+        impl: The FileProcessors implementation instance
+
+    Returns:
+        APIRouter configured for the File Processors API
+    """
+    router = APIRouter(
+        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
+        tags=["File Processors"],
+        responses=standard_responses,
+    )
+
+    @router.post(
+        "/file-processors/process",
+        response_model=ProcessFileResponse,
+        summary="Process a file into chunks ready for vector database storage.",
+        description="Process a file into chunks ready for vector database storage. Supports direct upload via multipart form or processing files already uploaded to file storage via file_id. Exactly one of file or file_id must be provided.",
+        responses={
+            200: {"description": "The processed file chunks."},
+        },
+    )
+    async def process_file(
+        file: Annotated[
+            UploadFile | None,
+            File(description="The File object to be uploaded and processed. Mutually exclusive with file_id."),
+        ] = None,
+        file_id: Annotated[
+            str | None, Form(description="ID of file already uploaded to file storage. Mutually exclusive with file.")
+        ] = None,
+        options: Annotated[
+            dict[str, Any] | None,
+            Form(
+                description="Optional processing options. Provider-specific parameters (e.g., OCR settings, output format)."
+            ),
+        ] = None,
+        chunking_strategy: Annotated[
+            VectorStoreChunkingStrategy | None,
+            Form(description="Optional chunking strategy for splitting content into chunks."),
+        ] = None,
+    ) -> ProcessFileResponse:
+        # Pass the parameters directly to the implementation
+        # The protocol method signature expects individual parameters for multipart handling
+        return await impl.process_file(
+            file=file,
+            file_id=file_id,
+            options=options,
+            chunking_strategy=chunking_strategy,
+        )
+
+    return router
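Once a provider is wired in, calling the endpoint is a plain multipart POST. The snippet below is a hypothetical client call; it assumes the LLAMA_STACK_API_V1ALPHA prefix renders as "v1alpha" and that a stack server with a file_processors provider is listening on the default localhost:8321.

# Hypothetical client-side call, not part of this commit. URL, port, and content
# type are assumptions; adjust to your deployment.
import httpx

with open("report.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8321/v1alpha/file-processors/process",
        files={"file": ("report.pdf", f, "application/pdf")},
        timeout=60.0,
    )
resp.raise_for_status()
body = resp.json()  # {"chunks": [...], "metadata": {...}}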
src/llama_stack_api/file_processors/models.py (new file, 42 lines)

@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Pydantic models for File Processors API responses.
+
+This module defines the response models for the File Processors API
+using Pydantic with Field descriptions for OpenAPI schema generation.
+
+Request models are not needed for this API since it uses multipart form data
+with individual parameters rather than a JSON request body.
+"""
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack_api.schema_utils import json_schema_type
+from llama_stack_api.vector_io import Chunk
+
+
+@json_schema_type
+class ProcessFileResponse(BaseModel):
+    """Response model for file processing operation.
+
+    Returns a list of chunks ready for storage in vector databases.
+    Each chunk contains the content and metadata.
+    """
+
+    chunks: list[Chunk] = Field(..., description="Processed chunks from the file. Always returns at least one chunk.")
+
+    metadata: dict[str, Any] = Field(
+        ...,
+        description="Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.",
+    )
+
+
+__all__ = [
+    "ProcessFileResponse",
+]
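As a quick check of the model, the following sketch constructs a response and serializes it to the JSON shape shown in the spec hunks earlier; it assumes Chunk accepts a plain string as content.

# Round-trip sketch, not part of this commit. Chunk's constructor arguments here
# are assumptions based on the Chunk schema referenced above.
from llama_stack_api import ProcessFileResponse
from llama_stack_api.vector_io import Chunk

resp = ProcessFileResponse(
    chunks=[Chunk(content="Page 1 text ...", metadata={"page": 1})],
    metadata={"processor": "example", "page_count": 1},
)
print(resp.model_dump_json(indent=2))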