Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 09:53:45 +00:00)
feat(api): add file_processor API skeleton
This change adds a file_processor API skeleton that provides a foundation for converting files into structured content for vector store ingestion, with support for chunking strategies and optional embedding generation.

Signed-off-by: Alina Ryan <aliryan@redhat.com>
parent 6147321083
commit 2664aeee2a
21 changed files with 258 additions and 0 deletions
docs/docs/providers/file_processor/index.mdx (new file, +10)
@@ -0,0 +1,10 @@
---
sidebar_label: File Processor
title: File_Processor
---

# File_Processor

## Overview

This section contains documentation for all available providers for the **file_processor** API.
docs/docs/providers/file_processor/inline_reference.mdx (new file, +17)
@@ -0,0 +1,17 @@
---
description: "Reference file processor implementation (placeholder for development)"
sidebar_label: Reference
title: inline::reference
---

# inline::reference

## Description

Reference file processor implementation (placeholder for development)

## Sample Configuration

```yaml
{}
```
@@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     files = "files"
     prompts = "prompts"
     conversations = "conversations"
+    file_processor = "file_processor"

     # built-in API
     inspect = "inspect"
src/llama_stack/apis/file_processor/__init__.py (new file, +7)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .file_processor import *
src/llama_stack/apis/file_processor/file_processor.py (new file, +96)
@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class ProcessFileRequest(BaseModel):
    """Request for processing a file into structured content."""

    file_data: bytes
    """Raw file data to process."""

    filename: str
    """Original filename for format detection and processing hints."""

    options: dict[str, Any] | None = None
    """Optional processing options. Provider-specific parameters."""

    chunking_strategy: VectorStoreChunkingStrategy | None = None
    """Optional chunking strategy for splitting content into chunks."""

    include_embeddings: bool = False
    """Whether to generate embeddings for chunks."""


@json_schema_type
class ProcessedContent(BaseModel):
    """Result of file processing operation."""

    content: str
    """Extracted text content from the file."""

    chunks: list[Chunk] | None = None
    """Optional chunks if chunking strategy was provided."""

    embeddings: list[list[float]] | None = None
    """Optional embeddings for chunks if requested."""

    metadata: dict[str, Any]
    """Processing metadata including processor name, timing, and provider-specific data."""


@telemetry_traceable
@runtime_checkable
class FileProcessor(Protocol):
    """
    File Processor API for converting files into structured, processable content.

    This API provides a flexible interface for processing various file formats
    (PDFs, documents, images, etc.) into text content that can be used for
    vector store ingestion, RAG applications, or standalone content extraction.

    The API supports:
    - Multiple file formats through extensible provider architecture
    - Configurable processing options per provider
    - Integration with vector store chunking strategies
    - Optional embedding generation for chunks
    - Rich metadata about processing results

    Future providers can extend this interface to support additional formats,
    processing capabilities, and optimization strategies.
    """

    @webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """
        Process a file into structured content with optional chunking and embeddings.

        This method processes raw file data and converts it into text content for applications such as vector store ingestion.

        :param file_data: Raw bytes of the file to process.
        :param filename: Original filename for format detection.
        :param options: Provider-specific processing options (e.g., OCR settings, output format).
        :param chunking_strategy: Optional strategy for splitting content into chunks.
        :param include_embeddings: Whether to generate embeddings for chunks.
        :returns: ProcessedContent with extracted text, optional chunks, and metadata.
        """
        ...
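A minimal sketch of driving this protocol from application code. Here `processor` stands in for any resolved provider instance; `VectorStoreChunkingStrategyAuto` is assumed to be the "auto" strategy model from the existing vector_io API, and the `"ocr"` option is a hypothetical provider-specific parameter, not something this commit defines:

```python
from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategyAuto


async def extract_for_rag(processor: FileProcessor, path: str) -> ProcessedContent:
    # Read raw bytes; the API takes bytes plus a filename hint, not a path.
    with open(path, "rb") as f:
        file_data = f.read()
    return await processor.process_file(
        file_data=file_data,
        filename=path,                                        # format-detection hint
        options={"ocr": True},                                # hypothetical provider option
        chunking_strategy=VectorStoreChunkingStrategyAuto(),  # chunk for vector store ingestion
        include_embeddings=False,
    )
```

With the skeleton in this commit, only the reference placeholder implementation would satisfy such a call.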
@@ -16,6 +16,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.datatypes import ExternalApiSpec
 from llama_stack.apis.eval import Eval
+from llama_stack.apis.file_processor import FileProcessor
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.files: Files,
         Api.prompts: Prompts,
         Api.conversations: Conversations,
+        Api.file_processor: FileProcessor,
     }

     if external_apis:
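Because `FileProcessor` is declared `runtime_checkable`, registering it in `api_protocol_map` lets the stack verify provider implementations structurally. A small sketch of that check, using the reference provider added later in this commit:

```python
from llama_stack.apis.file_processor import FileProcessor
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)
from llama_stack.providers.inline.file_processor.reference.reference import (
    ReferenceFileProcessorImpl,
)

impl = ReferenceFileProcessorImpl(ReferenceFileProcessorImplConfig(), deps={})
# Structural (duck-typed) check: passes because the class defines process_file.
assert isinstance(impl, FileProcessor)
```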
@@ -29,6 +29,8 @@ distribution_spec:
     - provider_type: remote::weaviate
   files:
   - provider_type: inline::localfs
+  file_processor:
+  - provider_type: inline::reference
   safety:
   - provider_type: inline::llama-guard
   - provider_type: inline::code-scanner
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
     metadata_store:
       table_name: files_metadata
       backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -30,6 +30,8 @@ distribution_spec:
     - provider_type: remote::weaviate
   files:
   - provider_type: inline::localfs
+  file_processor:
+  - provider_type: inline::reference
   safety:
   - provider_type: inline::llama-guard
   - provider_type: inline::code-scanner
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
     metadata_store:
       table_name: files_metadata
      backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
     metadata_store:
       table_name: files_metadata
       backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -30,6 +30,8 @@ distribution_spec:
     - provider_type: remote::weaviate
   files:
   - provider_type: inline::localfs
+  file_processor:
+  - provider_type: inline::reference
   safety:
   - provider_type: inline::llama-guard
   - provider_type: inline::code-scanner
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
     metadata_store:
       table_name: files_metadata
       backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
     metadata_store:
       table_name: files_metadata
       backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="remote::weaviate"),
         ],
         "files": [BuildProvider(provider_type="inline::localfs")],
+        "file_processor": [BuildProvider(provider_type="inline::reference")],
         "safety": [
             BuildProvider(provider_type="inline::llama-guard"),
             BuildProvider(provider_type="inline::code-scanner"),
@@ -45,6 +45,7 @@ CATEGORIES = [
     "providers",
     "models",
     "files",
+    "file_processor",
     "vector_io",
     "tool_runtime",
     "cli",
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import ReferenceFileProcessorImplConfig


async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps):
    from .reference import ReferenceFileProcessorImpl

    impl = ReferenceFileProcessorImpl(config, deps)
    await impl.initialize()
    return impl
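A minimal sketch of instantiating the provider through this factory, assuming an empty config and no dependencies (which mirrors the `{}` sample configuration):

```python
import asyncio

from llama_stack.providers.inline.file_processor.reference import get_provider_impl
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)


async def main() -> None:
    # The reference config has no fields, so it constructs with no arguments.
    impl = await get_provider_impl(ReferenceFileProcessorImplConfig(), deps={})
    print(type(impl).__name__)  # ReferenceFileProcessorImpl


asyncio.run(main())
```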
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pydantic import BaseModel


class ReferenceFileProcessorImplConfig(BaseModel):
    """Configuration for the reference file processor implementation."""

    @staticmethod
    def sample_run_config(**kwargs):
        return {}
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy

from .config import ReferenceFileProcessorImplConfig


class ReferenceFileProcessorImpl(FileProcessor):
    """Reference implementation of the FileProcessor API."""

    def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]):
        self.config = config
        self.deps = deps

    async def initialize(self) -> None:
        pass

    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """Process a file into structured content."""
        return ProcessedContent(
            content="Placeholder content",
            chunks=None,
            embeddings=None,
            metadata={
                "processor": "reference",
                "filename": filename,
            },
        )
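Because the implementation is a development placeholder, every call returns fixed content regardless of input. A quick sanity-check sketch of that behavior (not part of the commit):

```python
import asyncio

from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)
from llama_stack.providers.inline.file_processor.reference.reference import (
    ReferenceFileProcessorImpl,
)


async def main() -> None:
    impl = ReferenceFileProcessorImpl(ReferenceFileProcessorImplConfig(), deps={})
    await impl.initialize()
    result = await impl.process_file(file_data=b"hello", filename="notes.txt")
    # Fixed output: content is a constant, chunking/embeddings are ignored.
    assert result.content == "Placeholder content"
    assert result.metadata == {"processor": "reference", "filename": "notes.txt"}
    assert result.chunks is None and result.embeddings is None


asyncio.run(main())
```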
src/llama_stack/providers/registry/file_processor.py (new file, +20)
@@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.file_processor,
            provider_type="inline::reference",
            pip_packages=[],
            module="llama_stack.providers.inline.file_processor.reference",
            config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig",
            description="Reference file processor implementation (placeholder for development)",
        ),
    ]
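For completeness, a sketch of how distribution tooling might consult this registry entry (assumed usage, not code from this commit):

```python
from llama_stack.providers.datatypes import Api
from llama_stack.providers.registry.file_processor import available_providers

# Look up the single provider spec registered for the new API.
spec = next(p for p in available_providers() if p.provider_type == "inline::reference")
assert spec.api == Api.file_processor
print(spec.module)  # llama_stack.providers.inline.file_processor.reference
```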