Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 09:53:45 +00:00)
feat(api): add file_processor API skeleton
This change adds a file_processor API skeleton that provides a foundation for converting files into structured content for vector store ingestion, with support for chunking strategies and optional embedding generation.

Signed-off-by: Alina Ryan <aliryan@redhat.com>
parent 6147321083
commit 2664aeee2a

21 changed files with 258 additions and 0 deletions
docs/docs/providers/file_processor/index.mdx (new file, +10)

```mdx
---
sidebar_label: File Processor
title: File_Processor
---

# File_Processor

## Overview

This section contains documentation for all available providers for the **file_processor** API.
```
docs/docs/providers/file_processor/inline_reference.mdx (new file, +17)

````mdx
---
description: "Reference file processor implementation (placeholder for development)"
sidebar_label: Reference
title: inline::reference
---

# inline::reference

## Description

Reference file processor implementation (placeholder for development)

## Sample Configuration

```yaml
{}
```
````
Registering the new API in the `Api` enum:

```diff
@@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     files = "files"
     prompts = "prompts"
     conversations = "conversations"
+    file_processor = "file_processor"

     # built-in API
     inspect = "inspect"
```
src/llama_stack/apis/file_processor/__init__.py (new file, +7)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .file_processor import *
```
src/llama_stack/apis/file_processor/file_processor.py (new file, +96)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class ProcessFileRequest(BaseModel):
    """Request for processing a file into structured content."""

    file_data: bytes
    """Raw file data to process."""

    filename: str
    """Original filename for format detection and processing hints."""

    options: dict[str, Any] | None = None
    """Optional processing options. Provider-specific parameters."""

    chunking_strategy: VectorStoreChunkingStrategy | None = None
    """Optional chunking strategy for splitting content into chunks."""

    include_embeddings: bool = False
    """Whether to generate embeddings for chunks."""


@json_schema_type
class ProcessedContent(BaseModel):
    """Result of file processing operation."""

    content: str
    """Extracted text content from the file."""

    chunks: list[Chunk] | None = None
    """Optional chunks if chunking strategy was provided."""

    embeddings: list[list[float]] | None = None
    """Optional embeddings for chunks if requested."""

    metadata: dict[str, Any]
    """Processing metadata including processor name, timing, and provider-specific data."""


@telemetry_traceable
@runtime_checkable
class FileProcessor(Protocol):
    """
    File Processor API for converting files into structured, processable content.

    This API provides a flexible interface for processing various file formats
    (PDFs, documents, images, etc.) into text content that can be used for
    vector store ingestion, RAG applications, or standalone content extraction.

    The API supports:
    - Multiple file formats through extensible provider architecture
    - Configurable processing options per provider
    - Integration with vector store chunking strategies
    - Optional embedding generation for chunks
    - Rich metadata about processing results

    Future providers can extend this interface to support additional formats,
    processing capabilities, and optimization strategies.
    """

    @webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """
        Process a file into structured content with optional chunking and embeddings.

        This method processes raw file data and converts it into text content for
        applications such as vector store ingestion.

        :param file_data: Raw bytes of the file to process.
        :param filename: Original filename for format detection.
        :param options: Provider-specific processing options (e.g., OCR settings, output format).
        :param chunking_strategy: Optional strategy for splitting content into chunks.
        :param include_embeddings: Whether to generate embeddings for chunks.
        :returns: ProcessedContent with extracted text, optional chunks, and metadata.
        """
        ...
```
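Since this protocol is the heart of the change, here is a minimal usage sketch (not part of the commit) showing the request/response shape. The `ingest` helper is hypothetical, and the `options` keys are illustrative placeholders for provider-specific parameters:

```python
# Hypothetical usage sketch (not in this commit). Any object implementing
# the FileProcessor protocol works here, e.g. the reference provider below.
from llama_stack.apis.file_processor import FileProcessor, ProcessedContent


async def ingest(provider: FileProcessor, path: str) -> ProcessedContent:
    with open(path, "rb") as f:
        data = f.read()

    # No chunking strategy and no embeddings requested, so result.chunks
    # and result.embeddings will both be None.
    result = await provider.process_file(
        file_data=data,
        filename=path,
        options={"ocr": False},  # illustrative; option keys are provider-specific
    )
    print(result.content[:200], result.metadata)
    return result
```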
Wiring the new protocol into the resolver:

```diff
@@ -16,6 +16,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.datatypes import ExternalApiSpec
 from llama_stack.apis.eval import Eval
+from llama_stack.apis.file_processor import FileProcessor
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.files: Files,
         Api.prompts: Prompts,
         Api.conversations: Conversations,
+        Api.file_processor: FileProcessor,
     }

     if external_apis:
```
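As a quick illustration of what this mapping buys, a hypothetical check (not in the commit; the resolver's module path is assumed, as it is not shown in this mirror):

```python
# Hypothetical check; the module path for api_protocol_map is assumed.
from llama_stack.apis.file_processor import FileProcessor
from llama_stack.core.resolver import api_protocol_map  # path assumed
from llama_stack.providers.datatypes import Api

# The resolver can now look up the protocol class for the new API.
assert api_protocol_map()[Api.file_processor] is FileProcessor
```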
Distribution build config (path not shown in this mirror):

```diff
@@ -29,6 +29,8 @@ distribution_spec:
     - provider_type: remote::weaviate
     files:
     - provider_type: inline::localfs
+    file_processor:
+    - provider_type: inline::reference
     safety:
     - provider_type: inline::llama-guard
     - provider_type: inline::code-scanner
```
Distribution run config (path not shown in this mirror):

```diff
@@ -5,6 +5,7 @@ apis:
 - batches
 - datasetio
 - eval
+- file_processor
 - files
 - inference
 - post_training
@@ -154,6 +155,9 @@ providers:
       metadata_store:
         table_name: files_metadata
         backend: sql_default
+  file_processor:
+  - provider_id: reference
+    provider_type: inline::reference
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
```
The same `file_processor` hunks are repeated verbatim across the remaining distribution configs in this commit: two more build configs receive the identical `file_processor: - provider_type: inline::reference` addition, and four more run configs receive the identical `- file_processor` API entry plus the `file_processor` provider stanza shown above.
Registering the provider in the starter distribution template:

```diff
@@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="remote::weaviate"),
         ],
         "files": [BuildProvider(provider_type="inline::localfs")],
+        "file_processor": [BuildProvider(provider_type="inline::reference")],
         "safety": [
             BuildProvider(provider_type="inline::llama-guard"),
             BuildProvider(provider_type="inline::code-scanner"),
```
Adding a logging category for the new API:

```diff
@@ -45,6 +45,7 @@ CATEGORIES = [
     "providers",
     "models",
     "files",
+    "file_processor",
     "vector_io",
     "tool_runtime",
     "cli",
```
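This registers `file_processor` as a valid logging category. A plausible use from provider code, assuming the stack's usual `get_logger` helper in `llama_stack.log`:

```python
# Assumes the stack's standard get_logger helper; with the category
# registered above, per-category log levels apply to these records.
from llama_stack.log import get_logger

logger = get_logger(name=__name__, category="file_processor")
logger.info("file_processor provider initialized")
```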
New package `__init__.py` (+5; path not shown in this mirror, but the registry entry below gives the provider module as `llama_stack.providers.inline.file_processor.reference`):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
```
Provider entry point (new file, +15):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import ReferenceFileProcessorImplConfig


async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps):
    from .reference import ReferenceFileProcessorImpl

    impl = ReferenceFileProcessorImpl(config, deps)
    await impl.initialize()
    return impl
```
Provider config (new file, +15):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pydantic import BaseModel


class ReferenceFileProcessorImplConfig(BaseModel):
    """Configuration for the reference file processor implementation."""

    @staticmethod
    def sample_run_config(**kwargs):
        return {}
```
Placeholder reference implementation (new file, +42):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy

from .config import ReferenceFileProcessorImplConfig


class ReferenceFileProcessorImpl(FileProcessor):
    """Reference implementation of the FileProcessor API."""

    def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]):
        self.config = config
        self.deps = deps

    async def initialize(self) -> None:
        pass

    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        """Process a file into structured content."""
        return ProcessedContent(
            content="Placeholder content",
            chunks=None,
            embeddings=None,
            metadata={
                "processor": "reference",
                "filename": filename,
            },
        )
```
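A hedged smoke-test sketch (not in the commit) of wiring these pieces together through the provider entry point; the import paths follow the `module` and `config_class` strings in the registry entry below:

```python
# Hypothetical smoke test; import paths follow the registry's module string.
import asyncio

from llama_stack.providers.inline.file_processor.reference import get_provider_impl
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)


async def main() -> None:
    impl = await get_provider_impl(ReferenceFileProcessorImplConfig(), deps={})
    result = await impl.process_file(file_data=b"hello", filename="hello.txt")
    # The placeholder provider ignores the bytes and returns fixed content.
    assert result.content == "Placeholder content"
    print(result.metadata)  # {'processor': 'reference', 'filename': 'hello.txt'}


asyncio.run(main())
```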
src/llama_stack/providers/registry/file_processor.py (new file, +20)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.file_processor,
            provider_type="inline::reference",
            pip_packages=[],
            module="llama_stack.providers.inline.file_processor.reference",
            config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig",
            description="Reference file processor implementation (placeholder for development)",
        ),
    ]
```