feat(api): add file_processor API skeleton

This change adds a file_processor API skeleton that provides a foundation for converting files into structured content for vector store ingestion, with support for chunking strategies and optional embedding generation.

Signed-off-by: Alina Ryan <aliryan@redhat.com>
Alina Ryan 2025-11-08 23:50:49 -05:00
parent 6147321083
commit 2664aeee2a
21 changed files with 258 additions and 0 deletions

@@ -0,0 +1,10 @@
---
sidebar_label: File Processor
title: File_Processor
---
# File_Processor
## Overview
This section contains documentation for all available providers for the **file_processor** API.

@@ -0,0 +1,17 @@
---
description: "Reference file processor implementation (placeholder for development)"
sidebar_label: Reference
title: inline::reference
---
# inline::reference
## Description
Reference file processor implementation (placeholder for development)
## Sample Configuration
```yaml
{}
```

@@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    files = "files"
    prompts = "prompts"
    conversations = "conversations"
    file_processor = "file_processor"

    # built-in API
    inspect = "inspect"

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .file_processor import *

@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class ProcessFileRequest(BaseModel):
    """Request for processing a file into structured content."""

    file_data: bytes
    """Raw file data to process."""

    filename: str
    """Original filename for format detection and processing hints."""

    options: dict[str, Any] | None = None
    """Optional processing options. Provider-specific parameters."""

    chunking_strategy: VectorStoreChunkingStrategy | None = None
    """Optional chunking strategy for splitting content into chunks."""

    include_embeddings: bool = False
    """Whether to generate embeddings for chunks."""


@json_schema_type
class ProcessedContent(BaseModel):
    """Result of a file processing operation."""

    content: str
    """Extracted text content from the file."""

    chunks: list[Chunk] | None = None
    """Optional chunks if a chunking strategy was provided."""

    embeddings: list[list[float]] | None = None
    """Optional embeddings for chunks if requested."""

    metadata: dict[str, Any]
    """Processing metadata including processor name, timing, and provider-specific data."""


@telemetry_traceable
@runtime_checkable
class FileProcessor(Protocol):
    """
    File Processor API for converting files into structured, processable content.

    This API provides a flexible interface for processing various file formats
    (PDFs, documents, images, etc.) into text content that can be used for
    vector store ingestion, RAG applications, or standalone content extraction.

    The API supports:
    - Multiple file formats through an extensible provider architecture
    - Configurable processing options per provider
    - Integration with vector store chunking strategies
    - Optional embedding generation for chunks
    - Rich metadata about processing results

    Future providers can extend this interface to support additional formats,
    processing capabilities, and optimization strategies.
    """

    @webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """
        Process a file into structured content with optional chunking and embeddings.

        This method processes raw file data and converts it into text content for
        applications such as vector store ingestion.

        :param file_data: Raw bytes of the file to process.
        :param filename: Original filename for format detection.
        :param options: Provider-specific processing options (e.g., OCR settings, output format).
        :param chunking_strategy: Optional strategy for splitting content into chunks.
        :param include_embeddings: Whether to generate embeddings for chunks.
        :returns: ProcessedContent with extracted text, optional chunks, and metadata.
        """
        ...

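For orientation, here is a minimal sketch of how a caller might drive this protocol once an implementation is resolved. The `ingest` helper, the file path handling, and the `ocr` option are illustrative assumptions, not part of this change:

```python
from llama_stack.apis.file_processor import FileProcessor


async def ingest(processor: FileProcessor, path: str) -> None:
    # Read raw bytes; providers use the filename for format detection.
    with open(path, "rb") as f:
        data = f.read()

    result = await processor.process_file(
        file_data=data,
        filename=path,
        options={"ocr": True},  # hypothetical provider-specific option
    )
    print(result.content[:200])
    print(result.metadata)
```

Passing a `chunking_strategy` (and `include_embeddings=True`) would additionally populate `result.chunks` and `result.embeddings`.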
@@ -16,6 +16,7 @@ from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.datatypes import ExternalApiSpec
from llama_stack.apis.eval import Eval
from llama_stack.apis.file_processor import FileProcessor
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InferenceProvider
from llama_stack.apis.inspect import Inspect
@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
        Api.files: Files,
        Api.prompts: Prompts,
        Api.conversations: Conversations,
        Api.file_processor: FileProcessor,
    }

    if external_apis:

@@ -29,6 +29,8 @@ distribution_spec:
    - provider_type: remote::weaviate
    files:
    - provider_type: inline::localfs
    file_processor:
    - provider_type: inline::reference
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner

@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  file_processor:
  - provider_id: reference
    provider_type: inline::reference
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

@@ -30,6 +30,8 @@ distribution_spec:
    - provider_type: remote::weaviate
    files:
    - provider_type: inline::localfs
    file_processor:
    - provider_type: inline::reference
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner

@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  file_processor:
  - provider_id: reference
    provider_type: inline::reference
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  file_processor:
  - provider_id: reference
    provider_type: inline::reference
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

@@ -30,6 +30,8 @@ distribution_spec:
    - provider_type: remote::weaviate
    files:
    - provider_type: inline::localfs
    file_processor:
    - provider_type: inline::reference
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner

@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  file_processor:
  - provider_id: reference
    provider_type: inline::reference
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  file_processor:
  - provider_id: reference
    provider_type: inline::reference
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

@@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
            BuildProvider(provider_type="remote::weaviate"),
        ],
        "files": [BuildProvider(provider_type="inline::localfs")],
        "file_processor": [BuildProvider(provider_type="inline::reference")],
        "safety": [
            BuildProvider(provider_type="inline::llama-guard"),
            BuildProvider(provider_type="inline::code-scanner"),

@@ -45,6 +45,7 @@ CATEGORIES = [
    "providers",
    "models",
    "files",
    "file_processor",
    "vector_io",
    "tool_runtime",
    "cli",

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import ReferenceFileProcessorImplConfig


async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps):
    from .reference import ReferenceFileProcessorImpl

    impl = ReferenceFileProcessorImpl(config, deps)
    await impl.initialize()
    return impl

@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pydantic import BaseModel


class ReferenceFileProcessorImplConfig(BaseModel):
    """Configuration for the reference file processor implementation."""

    @staticmethod
    def sample_run_config(**kwargs):
        return {}

@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy

from .config import ReferenceFileProcessorImplConfig


class ReferenceFileProcessorImpl(FileProcessor):
    """Reference implementation of the FileProcessor API."""

    def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]):
        self.config = config
        self.deps = deps

    async def initialize(self) -> None:
        pass

    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """Process a file into structured content."""
        return ProcessedContent(
            content="Placeholder content",
            chunks=None,
            embeddings=None,
            metadata={
                "processor": "reference",
                "filename": filename,
            },
        )

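A quick way to sanity-check this wiring is to instantiate the placeholder directly. This is a minimal sketch assuming only the modules added in this commit:

```python
import asyncio

from llama_stack.apis.file_processor import FileProcessor
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)
from llama_stack.providers.inline.file_processor.reference.reference import (
    ReferenceFileProcessorImpl,
)


async def main() -> None:
    impl = ReferenceFileProcessorImpl(ReferenceFileProcessorImplConfig(), deps={})
    await impl.initialize()

    # FileProcessor is @runtime_checkable, so conformance is a structural check.
    assert isinstance(impl, FileProcessor)

    result = await impl.process_file(file_data=b"hello", filename="notes.txt")
    print(result.content)   # "Placeholder content"
    print(result.metadata)  # {"processor": "reference", "filename": "notes.txt"}


asyncio.run(main())
```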
@@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.file_processor,
            provider_type="inline::reference",
            pip_packages=[],
            module="llama_stack.providers.inline.file_processor.reference",
            config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig",
            description="Reference file processor implementation (placeholder for development)",
        ),
    ]
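For illustration, a future dependency-backed provider would be registered the same way, as another entry in this list. Every name below (the docling provider, its package, module path, and config class) is hypothetical and not part of this commit:

```python
# Hypothetical registry entry -- none of these names exist in this commit.
InlineProviderSpec(
    api=Api.file_processor,
    provider_type="inline::docling",  # hypothetical provider_type
    pip_packages=["docling"],  # hypothetical dependency
    module="llama_stack.providers.inline.file_processor.docling",
    config_class="llama_stack.providers.inline.file_processor.docling.config.DoclingFileProcessorConfig",
    description="Document conversion backed by a parsing library (hypothetical)",
),
```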