diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 485acd560..ca3bce31d 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -13234,6 +13234,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors
@@ -14000,6 +14001,29 @@ components:
       - items
       title: ConversationItemCreateRequest
       type: object
+    ProcessFileResponse:
+      description: |-
+        Response model for file processing operation.
+
+        Returns a list of chunks ready for storage in vector databases.
+        Each chunk contains the content and metadata.
+      properties:
+        chunks:
+          description: Processed chunks from the file. Always returns at least one chunk.
+          items:
+            $ref: '#/components/schemas/Chunk'
+          title: Chunks
+          type: array
+        metadata:
+          additionalProperties: true
+          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+          title: Metadata
+          type: object
+      required:
+      - chunks
+      - metadata
+      title: ProcessFileResponse
+      type: object
     PostTrainingJobLogStream:
       description: Stream of logs from a finetuning job.
       properties:
diff --git a/docs/docs/providers/file_processors/index.mdx b/docs/docs/providers/file_processors/index.mdx
new file mode 100644
index 000000000..e9119d40e
--- /dev/null
+++ b/docs/docs/providers/file_processors/index.mdx
@@ -0,0 +1,10 @@
+---
+sidebar_label: File Processors
+title: File Processors
+---
+
+# File Processors
+
+## Overview
+
+This section contains documentation for all available providers for the **file_processors** API.
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 26cbc07f9..ea2fc61d1 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -9912,6 +9912,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors
@@ -10678,6 +10679,29 @@ components:
       - items
       title: ConversationItemCreateRequest
       type: object
+    ProcessFileResponse:
+      description: |-
+        Response model for file processing operation.
+
+        Returns a list of chunks ready for storage in vector databases.
+        Each chunk contains the content and metadata.
+      properties:
+        chunks:
+          description: Processed chunks from the file. Always returns at least one chunk.
+          items:
+            $ref: '#/components/schemas/Chunk'
+          title: Chunks
+          type: array
+        metadata:
+          additionalProperties: true
+          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+          title: Metadata
+          type: object
+      required:
+      - chunks
+      - metadata
+      title: ProcessFileResponse
+      type: object
     PostTrainingJobLogStream:
       description: Stream of logs from a finetuning job.
       properties:
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index 14838ff25..5d8edf188 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -8986,6 +8986,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors
@@ -9752,6 +9753,29 @@ components:
       - items
       title: ConversationItemCreateRequest
       type: object
+    ProcessFileResponse:
+      description: |-
+        Response model for file processing operation.
+
+        Returns a list of chunks ready for storage in vector databases.
+        Each chunk contains the content and metadata.
+      properties:
+        chunks:
+          description: Processed chunks from the file. Always returns at least one chunk.
+          items:
+            $ref: '#/components/schemas/Chunk'
+          title: Chunks
+          type: array
+        metadata:
+          additionalProperties: true
+          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+          title: Metadata
+          type: object
+      required:
+      - chunks
+      - metadata
+      title: ProcessFileResponse
+      type: object
     PostTrainingJobLogStream:
       description: Stream of logs from a finetuning job.
       properties:
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 5cd2b684a..9ab9bcdb2 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -11407,6 +11407,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors
@@ -12170,6 +12171,29 @@ components:
       - items
       title: ConversationItemCreateRequest
       type: object
+    ProcessFileResponse:
+      description: |-
+        Response model for file processing operation.
+
+        Returns a list of chunks ready for storage in vector databases.
+        Each chunk contains the content and metadata.
+      properties:
+        chunks:
+          description: Processed chunks from the file. Always returns at least one chunk.
+          items:
+            $ref: '#/components/schemas/Chunk'
+          title: Chunks
+          type: array
+        metadata:
+          additionalProperties: true
+          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+          title: Metadata
+          type: object
+      required:
+      - chunks
+      - metadata
+      title: ProcessFileResponse
+      type: object
     PostTrainingJobLogStream:
       description: Stream of logs from a finetuning job.
       properties:
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 485acd560..ca3bce31d 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -13234,6 +13234,7 @@ components:
       - benchmarks
       - tool_groups
       - files
+      - file_processors
       - prompts
       - conversations
       - connectors
@@ -14000,6 +14001,29 @@ components:
       - items
       title: ConversationItemCreateRequest
       type: object
+    ProcessFileResponse:
+      description: |-
+        Response model for file processing operation.
+
+        Returns a list of chunks ready for storage in vector databases.
+        Each chunk contains the content and metadata.
+      properties:
+        chunks:
+          description: Processed chunks from the file. Always returns at least one chunk.
+          items:
+            $ref: '#/components/schemas/Chunk'
+          title: Chunks
+          type: array
+        metadata:
+          additionalProperties: true
+          description: Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.
+          title: Metadata
+          type: object
+      required:
+      - chunks
+      - metadata
+      title: ProcessFileResponse
+      type: object
     PostTrainingJobLogStream:
       description: Stream of logs from a finetuning job.
       properties:
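The same ProcessFileResponse schema lands in all five generated specs. For orientation, a response matching it would look roughly like the sketch below; every field value is illustrative, and the chunk entries show only the content-plus-metadata core of `#/components/schemas/Chunk`.

```python
# Illustrative ProcessFileResponse payload. All values here are made up;
# the chunk entries follow the content + metadata core of the Chunk schema.
example_response = {
    "chunks": [
        {
            "content": "Q3 revenue grew 12% year over year...",
            "metadata": {"document_id": "file-abc123", "page": 1},
        }
    ],
    "metadata": {
        "processor": "docling",  # hypothetical processor name
        "processing_time_ms": 842,
        "page_count": 1,
        "extraction_method": "docling",
    },
}
```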
diff --git a/src/llama_stack/core/resolver.py b/src/llama_stack/core/resolver.py
index 3a8bfe8c7..a0f6961d4 100644
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@@ -37,6 +37,7 @@ from llama_stack_api import (
     DatasetsProtocolPrivate,
     Eval,
     ExternalApiSpec,
+    FileProcessors,
     Files,
     Inference,
     InferenceProvider,
@@ -104,6 +105,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.files: Files,
         Api.prompts: Prompts,
         Api.conversations: Conversations,
+        Api.file_processors: FileProcessors,
         Api.connectors: Connectors,
     }
diff --git a/src/llama_stack/log.py b/src/llama_stack/log.py
index 709c7e0e0..4b3847f16 100644
--- a/src/llama_stack/log.py
+++ b/src/llama_stack/log.py
@@ -44,6 +44,7 @@ CATEGORIES = [
     "providers",
     "models",
     "files",
+    "file_processors",
     "vector_io",
     "tool_runtime",
     "cli",
diff --git a/src/llama_stack/providers/inline/file_processor/__init__.py b/src/llama_stack/providers/inline/file_processor/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/src/llama_stack/providers/inline/file_processor/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/src/llama_stack/providers/registry/file_processors.py b/src/llama_stack/providers/registry/file_processors.py
new file mode 100644
index 000000000..fef58de51
--- /dev/null
+++ b/src/llama_stack/providers/registry/file_processors.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack_api import ProviderSpec
+
+
+def available_providers() -> list[ProviderSpec]:
+    return []
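The registry ships empty. Once a concrete provider exists, `available_providers()` would return entries roughly like the sketch below; this assumes an `InlineProviderSpec` is exported next to `ProviderSpec`, and the provider type, module path, and config class shown are all hypothetical.

```python
# Sketch only: assumes InlineProviderSpec is available from llama_stack_api;
# the provider type, module, and config class below are hypothetical.
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.file_processors,
            provider_type="inline::reference",
            pip_packages=[],
            module="llama_stack.providers.inline.file_processor.reference",
            config_class="llama_stack.providers.inline.file_processor.reference.ReferenceConfig",
        ),
    ]
```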
diff --git a/src/llama_stack_api/__init__.py b/src/llama_stack_api/__init__.py
index 80ffb309c..727d6e711 100644
--- a/src/llama_stack_api/__init__.py
+++ b/src/llama_stack_api/__init__.py
@@ -144,6 +144,7 @@ from .datatypes import (
     VectorStoresProtocolPrivate,
 )
 from .eval import BenchmarkConfig, Eval, EvalCandidate, EvaluateResponse, ModelCandidate
+from .file_processors import FileProcessors, ProcessFileResponse
 from .files import (
     ExpiresAfter,
     Files,
@@ -563,6 +564,7 @@ __all__ = [
     "ExpiresAfter",
     "ExternalApiSpec",
     "ExtraBodyField",
+    "FileProcessors",
     "Files",
     "Fp8QuantizationConfig",
     "clear_dynamic_schema_types",
@@ -793,6 +795,7 @@ __all__ = [
     "ParamType",
     "parse_type",
     "PostTraining",
+    "ProcessFileResponse",
     "PostTrainingMetric",
     "PostTrainingJob",
     "PostTrainingJobArtifactsResponse",
diff --git a/src/llama_stack_api/datatypes.py b/src/llama_stack_api/datatypes.py
index 54b1487b7..70177c03d 100644
--- a/src/llama_stack_api/datatypes.py
+++ b/src/llama_stack_api/datatypes.py
@@ -110,6 +110,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar benchmarks: Benchmark suite management
     :cvar tool_groups: Tool group organization
     :cvar files: File storage and management
+    :cvar file_processors: File parsing and processing operations
     :cvar prompts: Prompt versions and management
     :cvar connectors: External connector management (e.g., MCP servers)
     :cvar inspect: Built-in system inspection and introspection
@@ -135,6 +136,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     benchmarks = "benchmarks"
     tool_groups = "tool_groups"
     files = "files"
+    file_processors = "file_processors"
     prompts = "prompts"
     conversations = "conversations"
     connectors = "connectors"
diff --git a/src/llama_stack_api/file_processors/__init__.py b/src/llama_stack_api/file_processors/__init__.py
new file mode 100644
index 000000000..3568fe4cc
--- /dev/null
+++ b/src/llama_stack_api/file_processors/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""File Processors API protocol and models.
+
+This module contains the File Processors protocol definition.
+Pydantic models are defined in llama_stack_api.file_processors.models.
+The FastAPI router is defined in llama_stack_api.file_processors.fastapi_routes.
+"""
+
+# Import fastapi_routes for router factory access
+from . import fastapi_routes
+
+# Import protocol for re-export
+from .api import FileProcessors
+
+# Import models for re-export
+from .models import ProcessFileResponse
+
+__all__ = [
+    "FileProcessors",
+    "ProcessFileResponse",
+    "fastapi_routes",
+]
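With the enum member and the re-exports in place, the new API is reachable through the package's public surface. A quick wiring check, assuming `Api` is re-exported from `llama_stack_api` like the other datatypes:

```python
from llama_stack_api import Api, FileProcessors, ProcessFileResponse

# The enum value doubles as the API id used in run configs and routing.
assert Api.file_processors.value == "file_processors"

# FileProcessors is a runtime-checkable Protocol (see api.py below), so
# provider instances are matched structurally, not by inheritance.
print(isinstance(object(), FileProcessors))  # False: no process_file method
```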
diff --git a/src/llama_stack_api/file_processors/api.py b/src/llama_stack_api/file_processors/api.py
new file mode 100644
index 000000000..ef1c1edf8
--- /dev/null
+++ b/src/llama_stack_api/file_processors/api.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Protocol, runtime_checkable
+
+from fastapi import UploadFile
+
+from llama_stack_api.vector_io import VectorStoreChunkingStrategy
+
+from .models import ProcessFileResponse
+
+
+@runtime_checkable
+class FileProcessors(Protocol):
+    """
+    File Processors API for converting files into structured, processable content.
+
+    This API provides a flexible interface for processing various file formats
+    (PDFs, documents, images, etc.) into normalized text content that can be used for
+    vector store ingestion, RAG applications, or standalone content extraction.
+
+    The API focuses on parsing and normalization:
+    - Multiple file formats through an extensible provider architecture
+    - Multipart form uploads or file ID references
+    - Configurable processing options per provider
+    - Optional chunking using the provider's native capabilities
+    - Rich metadata about processing results
+
+    For embedding generation, use the chunks from this API with the separate
+    embedding API to maintain a clean separation of concerns.
+
+    Future providers can extend this interface to support additional formats,
+    processing capabilities, and optimization strategies.
+    """
+
+    async def process_file(
+        self,
+        file: UploadFile | None = None,
+        file_id: str | None = None,
+        options: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> ProcessFileResponse:
+        """
+        Process a file into chunks ready for vector database storage.
+
+        This method supports two modes of operation via multipart form request:
+        1. Direct upload: upload and process a file directly (file parameter)
+        2. File storage: process a file already uploaded to file storage (file_id parameter)
+
+        Exactly one of file or file_id must be provided.
+
+        If no chunking_strategy is provided, the entire file content is returned as a single chunk.
+        If a chunking_strategy is provided, the file is split according to the strategy.
+
+        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.). Mutually exclusive with file_id.
+        :param file_id: ID of a file already uploaded to file storage. Mutually exclusive with file.
+        :param options: Provider-specific processing options (e.g., OCR settings, output format).
+        :param chunking_strategy: Optional strategy for splitting content into chunks.
+        :returns: ProcessFileResponse with chunks ready for vector database storage.
+        """
+        ...
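To make the contract concrete, here is a minimal sketch of a provider that satisfies this protocol by implementing only the documented no-strategy fallback: decode the upload and return it as a single chunk. It assumes `Chunk` accepts `content` and `metadata` keyword arguments; the file_id lookup and real error handling are out of scope.

```python
import time
from typing import Any

from fastapi import UploadFile

from llama_stack_api import ProcessFileResponse
from llama_stack_api.vector_io import Chunk, VectorStoreChunkingStrategy


class PassthroughFileProcessor:
    """Hypothetical provider: returns the whole document as one chunk."""

    async def process_file(
        self,
        file: UploadFile | None = None,
        file_id: str | None = None,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> ProcessFileResponse:
        # Enforce the documented contract: exactly one of file / file_id.
        if (file is None) == (file_id is None):
            raise ValueError("Exactly one of file or file_id must be provided")
        if file is None:
            raise NotImplementedError("file_id lookup is out of scope for this sketch")

        started = time.monotonic()
        text = (await file.read()).decode("utf-8", errors="replace")

        # No chunking_strategy handling here: fall back to a single chunk.
        return ProcessFileResponse(
            chunks=[Chunk(content=text, metadata={"filename": file.filename})],
            metadata={
                "processor": "passthrough",  # illustrative name
                "processing_time_ms": int((time.monotonic() - started) * 1000),
                "extraction_method": "utf-8-decode",
            },
        )
```

Because the protocol is runtime-checkable, `isinstance(PassthroughFileProcessor(), FileProcessors)` holds with no inheritance relationship.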
diff --git a/src/llama_stack_api/file_processors/fastapi_routes.py b/src/llama_stack_api/file_processors/fastapi_routes.py
new file mode 100644
index 000000000..02541ae11
--- /dev/null
+++ b/src/llama_stack_api/file_processors/fastapi_routes.py
@@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""FastAPI router for the File Processors API.
+
+This module defines the FastAPI router for the File Processors API using standard
+FastAPI route decorators. The router is defined in the API package to keep
+all API-related code together.
+"""
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, File, Form, UploadFile
+
+from llama_stack_api.router_utils import standard_responses
+from llama_stack_api.vector_io import VectorStoreChunkingStrategy
+from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
+
+from .api import FileProcessors
+from .models import ProcessFileResponse
+
+
+def create_router(impl: FileProcessors) -> APIRouter:
+    """Create a FastAPI router for the File Processors API.
+
+    Args:
+        impl: The FileProcessors implementation instance
+
+    Returns:
+        APIRouter configured for the File Processors API
+    """
+    router = APIRouter(
+        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
+        tags=["File Processors"],
+        responses=standard_responses,
+    )
+
+    @router.post(
+        "/file-processors/process",
+        response_model=ProcessFileResponse,
+        summary="Process a file into chunks ready for vector database storage.",
+        description="Process a file into chunks ready for vector database storage. Supports direct upload via multipart form, or processing files already uploaded to file storage via file_id. Exactly one of file or file_id must be provided.",
+        responses={
+            200: {"description": "The processed file chunks."},
+        },
+    )
+    async def process_file(
+        file: Annotated[
+            UploadFile | None,
+            File(description="The File object to be uploaded and processed. Mutually exclusive with file_id."),
+        ] = None,
+        file_id: Annotated[
+            str | None, Form(description="ID of a file already uploaded to file storage. Mutually exclusive with file.")
+        ] = None,
+        options: Annotated[
+            dict[str, Any] | None,
+            Form(
+                description="Optional processing options. Provider-specific parameters (e.g., OCR settings, output format)."
+            ),
+        ] = None,
+        chunking_strategy: Annotated[
+            VectorStoreChunkingStrategy | None,
+            Form(description="Optional chunking strategy for splitting content into chunks."),
+        ] = None,
+    ) -> ProcessFileResponse:
+        # Pass the parameters directly to the implementation.
+        # The protocol method signature expects individual parameters for multipart handling.
+        return await impl.process_file(
+            file=file,
+            file_id=file_id,
+            options=options,
+            chunking_strategy=chunking_strategy,
+        )
+
+    return router
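On the wire this is a plain multipart form POST. A client-side sketch, assuming `LLAMA_STACK_API_V1ALPHA` renders as `v1alpha` and a stack server on the default port 8321:

```python
import httpx

# Direct-upload mode: send the file and omit file_id entirely,
# since exactly one of the two is accepted.
with open("report.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8321/v1alpha/file-processors/process",
        files={"file": ("report.pdf", f, "application/pdf")},
        timeout=60.0,
    )
resp.raise_for_status()
body = resp.json()
print(len(body["chunks"]), body["metadata"])
```

Note that `options` and `chunking_strategy` are declared as Form fields with structured types; clients would most likely send them JSON-encoded, which presumes matching parsing on the server side.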
diff --git a/src/llama_stack_api/file_processors/models.py b/src/llama_stack_api/file_processors/models.py
new file mode 100644
index 000000000..d2b9ecf1c
--- /dev/null
+++ b/src/llama_stack_api/file_processors/models.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Pydantic models for File Processors API responses.
+
+This module defines the response models for the File Processors API
+using Pydantic with Field descriptions for OpenAPI schema generation.
+
+Request models are not needed for this API since it uses multipart form data
+with individual parameters rather than a JSON request body.
+"""
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack_api.schema_utils import json_schema_type
+from llama_stack_api.vector_io import Chunk
+
+
+@json_schema_type
+class ProcessFileResponse(BaseModel):
+    """Response model for file processing operation.
+
+    Returns a list of chunks ready for storage in vector databases.
+    Each chunk contains the content and metadata.
+    """
+
+    chunks: list[Chunk] = Field(..., description="Processed chunks from the file. Always returns at least one chunk.")
+
+    metadata: dict[str, Any] = Field(
+        ...,
+        description="Processing-run metadata such as processor name/version, processing_time_ms, page_count, extraction_method (e.g. docling/pypdf/ocr), confidence scores, plus provider-specific fields.",
+    )
+
+
+__all__ = [
+    "ProcessFileResponse",
+]
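One schema-level note: the field description promises at least one chunk, but nothing enforces it, so `ProcessFileResponse(chunks=[], metadata={})` validates. If the guarantee is meant to be machine-checked, Pydantic v2 can express it with `min_length`; a self-contained sketch (using a plain dict in place of Chunk):

```python
from typing import Any

from pydantic import BaseModel, Field


class StrictProcessFileResponse(BaseModel):
    """Sketch: same shape, with the at-least-one-chunk promise enforced."""

    # min_length=1 makes Pydantic reject an empty chunks list at validation time.
    chunks: list[dict[str, Any]] = Field(..., min_length=1)
    metadata: dict[str, Any] = Field(...)
```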