Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 09:53:45 +00:00)
feat(api): add file_processor API skeleton
This change adds a file_processor API skeleton that provides a foundation for converting files into structured content for vector store ingestion, with support for chunking strategies and optional embedding generation.

Signed-off-by: Alina Ryan <aliryan@redhat.com>
parent 6147321083
commit 2664aeee2a

21 changed files with 258 additions and 0 deletions
docs/docs/providers/file_processor/index.mdx (new file, +10)

```mdx
---
sidebar_label: File Processor
title: File_Processor
---

# File_Processor

## Overview

This section contains documentation for all available providers for the **file_processor** API.
```
docs/docs/providers/file_processor/inline_reference.mdx (new file, +17)

````mdx
---
description: "Reference file processor implementation (placeholder for development)"
sidebar_label: Reference
title: inline::reference
---

# inline::reference

## Description

Reference file processor implementation (placeholder for development)

## Sample Configuration

```yaml
{}
```
````
Registering the new API in the `Api` enum:

```diff
@@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     files = "files"
     prompts = "prompts"
     conversations = "conversations"
+    file_processor = "file_processor"

     # built-in API
     inspect = "inspect"
```
src/llama_stack/apis/file_processor/__init__.py (new file, +7)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .file_processor import *
```
src/llama_stack/apis/file_processor/file_processor.py (new file, +96)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class ProcessFileRequest(BaseModel):
    """Request for processing a file into structured content."""

    file_data: bytes
    """Raw file data to process."""

    filename: str
    """Original filename for format detection and processing hints."""

    options: dict[str, Any] | None = None
    """Optional processing options. Provider-specific parameters."""

    chunking_strategy: VectorStoreChunkingStrategy | None = None
    """Optional chunking strategy for splitting content into chunks."""

    include_embeddings: bool = False
    """Whether to generate embeddings for chunks."""


@json_schema_type
class ProcessedContent(BaseModel):
    """Result of file processing operation."""

    content: str
    """Extracted text content from the file."""

    chunks: list[Chunk] | None = None
    """Optional chunks if chunking strategy was provided."""

    embeddings: list[list[float]] | None = None
    """Optional embeddings for chunks if requested."""

    metadata: dict[str, Any]
    """Processing metadata including processor name, timing, and provider-specific data."""


@telemetry_traceable
@runtime_checkable
class FileProcessor(Protocol):
    """
    File Processor API for converting files into structured, processable content.

    This API provides a flexible interface for processing various file formats
    (PDFs, documents, images, etc.) into text content that can be used for
    vector store ingestion, RAG applications, or standalone content extraction.

    The API supports:
    - Multiple file formats through extensible provider architecture
    - Configurable processing options per provider
    - Integration with vector store chunking strategies
    - Optional embedding generation for chunks
    - Rich metadata about processing results

    Future providers can extend this interface to support additional formats,
    processing capabilities, and optimization strategies.
    """

    @webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """
        Process a file into structured content with optional chunking and embeddings.

        This method processes raw file data and converts it into text content for
        applications such as vector store ingestion.

        :param file_data: Raw bytes of the file to process.
        :param filename: Original filename for format detection.
        :param options: Provider-specific processing options (e.g., OCR settings, output format).
        :param chunking_strategy: Optional strategy for splitting content into chunks.
        :param include_embeddings: Whether to generate embeddings for chunks.
        :returns: ProcessedContent with extracted text, optional chunks, and metadata.
        """
        ...
```
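Since this protocol is the heart of the change, here is a minimal usage sketch (not part of the commit) showing the request/response shape. The `ingest` helper is hypothetical, and the `options` keys are illustrative placeholders for provider-specific parameters:

```python
# Hypothetical usage sketch (not in this commit). Any object implementing
# the FileProcessor protocol works here, e.g. the reference provider below.
from llama_stack.apis.file_processor import FileProcessor, ProcessedContent


async def ingest(provider: FileProcessor, path: str) -> ProcessedContent:
    with open(path, "rb") as f:
        data = f.read()

    # No chunking strategy and no embeddings requested, so result.chunks
    # and result.embeddings will both be None.
    result = await provider.process_file(
        file_data=data,
        filename=path,
        options={"ocr": False},  # illustrative; option keys are provider-specific
    )
    print(result.content[:200], result.metadata)
    return result
```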
Wiring the new protocol into the resolver:

```diff
@@ -16,6 +16,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.datatypes import ExternalApiSpec
 from llama_stack.apis.eval import Eval
+from llama_stack.apis.file_processor import FileProcessor
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.files: Files,
         Api.prompts: Prompts,
         Api.conversations: Conversations,
+        Api.file_processor: FileProcessor,
     }

     if external_apis:
```
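As a quick illustration of what this mapping buys, a hypothetical check (not in the commit; the resolver's module path is assumed, as it is not shown in this mirror):

```python
# Hypothetical check; the module path for api_protocol_map is assumed.
from llama_stack.apis.file_processor import FileProcessor
from llama_stack.core.resolver import api_protocol_map  # path assumed
from llama_stack.providers.datatypes import Api

# The resolver can now look up the protocol class for the new API.
assert api_protocol_map()[Api.file_processor] is FileProcessor
```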
Distribution build config (path not shown in this mirror):

```diff
@@ -29,6 +29,8 @@ distribution_spec:
     - provider_type: remote::weaviate
     files:
     - provider_type: inline::localfs
+    file_processor:
+    - provider_type: inline::reference
     safety:
     - provider_type: inline::llama-guard
     - provider_type: inline::code-scanner
```
Distribution run config (path not shown in this mirror):

```diff
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
       metadata_store:
         table_name: files_metadata
         backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
```
The same `file_processor` hunks are repeated verbatim across the remaining distribution configs in this commit: two more build configs receive the identical `file_processor: - provider_type: inline::reference` addition, and four more run configs receive the identical `- file_processor` API entry plus the `file_processor` provider stanza shown above.
Registering the provider in the starter distribution template:

```diff
@@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="remote::weaviate"),
         ],
         "files": [BuildProvider(provider_type="inline::localfs")],
+        "file_processor": [BuildProvider(provider_type="inline::reference")],
         "safety": [
             BuildProvider(provider_type="inline::llama-guard"),
             BuildProvider(provider_type="inline::code-scanner"),
```
Adding a logging category for the new API:

```diff
@@ -45,6 +45,7 @@ CATEGORIES = [
     "providers",
     "models",
     "files",
+    "file_processor",
     "vector_io",
     "tool_runtime",
     "cli",
```
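This registers `file_processor` as a valid logging category. A plausible use from provider code, assuming the stack's usual `get_logger` helper in `llama_stack.log`:

```python
# Assumes the stack's standard get_logger helper; with the category
# registered above, per-category log levels apply to these records.
from llama_stack.log import get_logger

logger = get_logger(name=__name__, category="file_processor")
logger.info("file_processor provider initialized")
```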
New package `__init__.py` (+5; path not shown in this mirror, but the registry entry below gives the provider module as `llama_stack.providers.inline.file_processor.reference`):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
```
Provider entry point (new file, +15):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import ReferenceFileProcessorImplConfig


async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps):
    from .reference import ReferenceFileProcessorImpl

    impl = ReferenceFileProcessorImpl(config, deps)
    await impl.initialize()
    return impl
```
Provider config (new file, +15):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pydantic import BaseModel


class ReferenceFileProcessorImplConfig(BaseModel):
    """Configuration for the reference file processor implementation."""

    @staticmethod
    def sample_run_config(**kwargs):
        return {}
```
Placeholder reference implementation (new file, +42):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy

from .config import ReferenceFileProcessorImplConfig


class ReferenceFileProcessorImpl(FileProcessor):
    """Reference implementation of the FileProcessor API."""

    def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]):
        self.config = config
        self.deps = deps

    async def initialize(self) -> None:
        pass

    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """Process a file into structured content."""
        return ProcessedContent(
            content="Placeholder content",
            chunks=None,
            embeddings=None,
            metadata={
                "processor": "reference",
                "filename": filename,
            },
        )
```
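A hedged smoke-test sketch (not in the commit) of wiring these pieces together through the provider entry point; the import paths follow the `module` and `config_class` strings in the registry entry below:

```python
# Hypothetical smoke test; import paths follow the registry's module string.
import asyncio

from llama_stack.providers.inline.file_processor.reference import get_provider_impl
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)


async def main() -> None:
    impl = await get_provider_impl(ReferenceFileProcessorImplConfig(), deps={})
    result = await impl.process_file(file_data=b"hello", filename="hello.txt")
    # The placeholder provider ignores the bytes and returns fixed content.
    assert result.content == "Placeholder content"
    print(result.metadata)  # {'processor': 'reference', 'filename': 'hello.txt'}


asyncio.run(main())
```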
src/llama_stack/providers/registry/file_processor.py (new file, +20)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.file_processor,
            provider_type="inline::reference",
            pip_packages=[],
            module="llama_stack.providers.inline.file_processor.reference",
            config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig",
            description="Reference file processor implementation (placeholder for development)",
        ),
    ]
```