init: first remote llamacpp implementation

Young Han 2025-07-12 16:28:08 -07:00
parent 8374d4cefd
commit ec73d0d55b
9 changed files with 675 additions and 0 deletions

@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.inference import InferenceProvider

from .config import LlamaCppImplConfig


async def get_adapter_impl(config: LlamaCppImplConfig, _deps) -> InferenceProvider:
    # imported lazily so the adapter module is only loaded when it is actually needed
    from .llamacpp import LlamaCppInferenceAdapter

    adapter = LlamaCppInferenceAdapter(config)
    return adapter
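
For context, a minimal sketch of constructing the adapter by hand through this entry point. Normally the Llama Stack provider registry calls `get_adapter_impl`; the empty deps argument and the asyncio wrapper are assumptions made for illustration only.

```python
import asyncio

from llama_stack.providers.remote.inference.llamacpp import get_adapter_impl
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig


async def main() -> None:
    # Point the adapter at a locally running llama.cpp server; no API key needed.
    config = LlamaCppImplConfig(openai_compat_api_base="http://localhost:8080/v1")
    adapter = await get_adapter_impl(config, {})  # deps are unused by this provider
    await adapter.initialize()


asyncio.run(main())
```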

@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Union

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type


class LlamaCppProviderDataValidator(BaseModel):
    llamacpp_api_key: Union[str, None] = Field(
        default=None,
        description="API key for llama.cpp server (optional for local servers)",
    )


@json_schema_type
class LlamaCppImplConfig(BaseModel):
    api_key: Union[str, None] = Field(
        default=None,
        description="The llama.cpp server API key (optional for local servers)",
    )
    openai_compat_api_base: str = Field(
        default="http://localhost:8080/v1",
        description="The URL for the llama.cpp server with OpenAI-compatible API",
    )

    @classmethod
    def sample_run_config(
        cls, api_key: str = "${env.LLAMACPP_API_KEY:}"
    ) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "${env.LLAMACPP_URL:http://localhost:8080}/v1",
            "api_key": api_key,
        }
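
A short sketch of how this config behaves, using only the fields defined above. Note that `sample_run_config` deliberately returns `${env.*}` placeholders, which are substituted from the environment when the generated run.yaml is loaded, not here; the remote host name below is hypothetical.

```python
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig

# Explicit values, e.g. a llama.cpp server on another machine (hypothetical host).
config = LlamaCppImplConfig(openai_compat_api_base="http://gpu-box:8080/v1")
assert config.api_key is None  # optional for local/unauthenticated servers

# The sample run config keeps environment placeholders rather than concrete values.
print(LlamaCppImplConfig.sample_run_config())
# {'openai_compat_api_base': '${env.LLAMACPP_URL:http://localhost:8080}/v1',
#  'api_key': '${env.LLAMACPP_API_KEY:}'}
```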

@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.models.models import Model
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
    LiteLLMOpenAIMixin,
)


class LlamaCppInferenceAdapter(LiteLLMOpenAIMixin):
    _config: LlamaCppImplConfig

    def __init__(self, config: LlamaCppImplConfig):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=[],  # llama.cpp can work with any GGUF model
            api_key_from_config=config.api_key,
            provider_data_api_key_field="llamacpp_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def register_model(self, model: Model) -> Model:
        # llama.cpp can work with any GGUF model, so we accept any model name
        # without validation against a predefined list
        return model

    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
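
Because the adapter targets the server's OpenAI-compatible endpoint, the two config fields amount to roughly the following raw request. This is an illustrative sketch using `requests`, not how `LiteLLMOpenAIMixin` is implemented internally; the model identifier is hypothetical, since llama.cpp serves whatever model it was started with.

```python
import requests

OPENAI_COMPAT_API_BASE = "http://localhost:8080/v1"  # config.openai_compat_api_base
API_KEY = None  # config.api_key (or the per-request llamacpp_api_key provider data)

headers = {"Content-Type": "application/json"}
if API_KEY:
    headers["Authorization"] = f"Bearer {API_KEY}"  # only sent when a key is configured

resp = requests.post(
    f"{OPENAI_COMPAT_API_BASE}/chat/completions",
    headers=headers,
    json={
        "model": "any-gguf-model",  # hypothetical name; llama.cpp accepts any identifier
        "messages": [{"role": "user", "content": "ping"}],
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```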

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .llamacpp import get_distribution_template # noqa: F401

@@ -0,0 +1,38 @@
version: "2"
distribution_spec:
  description: Use llama.cpp server for running LLM inference
  providers:
    inference:
    - remote::llamacpp
    - inline::sentence-transformers
    vector_io:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    files:
    - inline::localfs
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
    - remote::wolfram-alpha
    - inline::rag-runtime
    - remote::model-context-protocol
image_type: conda
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]

@@ -0,0 +1,82 @@
# Llama Stack with llama.cpp
This template shows you how to run Llama Stack with [llama.cpp](https://github.com/ggerganov/llama.cpp) as the inference provider.
## Prerequisites
1. **Install llama.cpp**: Follow the installation instructions from the [llama.cpp repository](https://github.com/ggerganov/llama.cpp)
2. **Download a model**: Download a GGUF format model file (e.g., from Hugging Face)
## Starting llama.cpp Server
Before running Llama Stack, you need to start the llama.cpp server:
```bash
# Example: Start llama.cpp server with a model
./llama-server -m /path/to/your/model.gguf -c 4096 --host 0.0.0.0 --port 8080
```
Common llama.cpp server options:
- `-m`: Path to the GGUF model file
- `-c`: Context size (default: 512)
- `--host`: Host to bind to (default: 127.0.0.1)
- `--port`: Port to bind to (default: 8080)
- `-ngl`: Number of layers to offload to GPU
- `--chat-template`: Chat template to use
## Environment Variables
Set these environment variables before running Llama Stack:
```bash
export LLAMACPP_URL=http://localhost:8080 # URL of your llama.cpp server (without /v1 suffix)
export INFERENCE_MODEL=your-model-name # Name/identifier for your model
export LLAMACPP_API_KEY="" # API key (leave empty for local servers)
```
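
Before starting Llama Stack, you can optionally sanity-check the server's OpenAI-compatible endpoint directly. This is a sketch using the `openai` Python package (installed separately); it lists the models the server exposes and runs one chat completion.

```python
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.environ.get("LLAMACPP_URL", "http://localhost:8080") + "/v1",
    api_key=os.environ.get("LLAMACPP_API_KEY") or "not-needed",  # client requires a non-empty key
)

# List whatever model(s) the llama.cpp server is exposing.
print([m.id for m in client.models.list().data])

reply = client.chat.completions.create(
    model=os.environ.get("INFERENCE_MODEL", "your-model-name"),
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(reply.choices[0].message.content)
```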
## Running Llama Stack
```bash
llama stack run llamacpp
```
## Configuration
The template uses the following configuration:
- **Inference Provider**: `remote::llamacpp` - Connects to your llama.cpp server via OpenAI-compatible API
- **Default URL**: `http://localhost:8080` (configurable via `LLAMACPP_URL`)
- **Vector Store**: FAISS for local vector storage
- **Safety**: Llama Guard for content safety
- **Other providers**: Standard Meta reference implementations
## Model Support
This template works with any GGUF format model supported by llama.cpp, including:
- Llama 2/3 models
- Code Llama models
- Other transformer-based models converted to GGUF format
## Troubleshooting
1. **Connection refused**: Make sure your llama.cpp server is running and accessible
2. **Model not found**: Verify the model path and that the GGUF file exists
3. **Out of memory**: Reduce context size (`-c`) or use GPU offloading (`-ngl`)
4. **Slow inference**: Consider using GPU acceleration or quantized models
## Advanced Configuration
You can customize the llama.cpp server configuration by modifying the server startup command. For production use, consider:
- Using GPU acceleration with `-ngl` parameter
- Adjusting batch size with `-b` parameter
- Setting appropriate context size with `-c` parameter
- Using multiple threads with `-t` parameter
For more llama.cpp server options, run:
```bash
./llama-server --help
```
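## Example Client Usage
Once the stack is running, you can talk to it with the `llama-stack-client` Python SDK. The sketch below is illustrative only; exact method and field names may vary across client versions.
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # LLAMA_STACK_PORT

# Models registered with the stack (including the llama.cpp-backed one).
for model in client.models.list():
    print(model.identifier)

response = client.inference.chat_completion(
    model_id="your-model-name",  # the value exported as INFERENCE_MODEL
    messages=[{"role": "user", "content": "Hello from llama.cpp!"}],
)
print(response.completion_message.content)
```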

@@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import (
    ModelInput,
    Provider,
    ShieldInput,
    ToolGroupInput,
)
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::llamacpp"],
        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "files": ["inline::localfs"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "remote::wolfram-alpha",
            "inline::rag-runtime",
            "remote::model-context-protocol",
        ],
    }
    name = "llamacpp"

    inference_provider = Provider(
        provider_id="llamacpp",
        provider_type="remote::llamacpp",
        config=LlamaCppImplConfig.sample_run_config(),
    )
    vector_io_provider_faiss = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
    )
    files_provider = Provider(
        provider_id="meta-reference-files",
        provider_type="inline::localfs",
        config=LocalfsFilesImplConfig.sample_run_config(
            f"~/.llama/distributions/{name}"
        ),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="llamacpp",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="llamacpp",
    )
    embedding_model = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id="sentence-transformers",
        model_type=ModelType.embedding,
        metadata={
            "embedding_dimension": 384,
        },
    )

    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="tavily-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::wolfram_alpha",
            provider_id="wolfram-alpha",
        ),
    ]

    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Use llama.cpp server for running LLM inference",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
                    "files": [files_provider],
                },
                default_models=[inference_model, embedding_model],
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
                    "files": [files_provider],
                    "safety": [
                        Provider(
                            provider_id="llama-guard",
                            provider_type="inline::llama-guard",
                            config={},
                        ),
                    ],
                },
                default_models=[
                    inference_model,
                    safety_model,
                    embedding_model,
                ],
                default_shields=[
                    ShieldInput(
                        shield_id="${env.SAFETY_MODEL}",
                        provider_id="llama-guard",
                    ),
                ],
                default_tool_groups=default_tool_groups,
            ),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (
                "8321",
                "Port for the Llama Stack distribution server",
            ),
            "LLAMACPP_URL": (
                "http://localhost:8080",
                "URL of the llama.cpp server (without /v1 suffix)",
            ),
            "LLAMACPP_API_KEY": (
                "",
                "API key for llama.cpp server (leave empty for local servers)",
            ),
            "INFERENCE_MODEL": (
                "llama-model",
                "Inference model identifier for llama.cpp server",
            ),
            "SAFETY_MODEL": (
                "llama-guard",
                "Safety model identifier for llama.cpp server",
            ),
        },
    )
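
For orientation, a small sketch that renders this template and prints what it declares. It assumes the template package is importable as `llama_stack.templates.llamacpp` (as the package `__init__.py` above suggests) and that `DistributionTemplate` exposes its constructor arguments as attributes.

```python
from llama_stack.templates.llamacpp import get_distribution_template

template = get_distribution_template()
print(template.name)  # "llamacpp"

# Provider types declared per API (inference, vector_io, safety, ...).
for api, provider_types in template.providers.items():
    print(f"{api}: {provider_types}")

# The two generated run configurations: run.yaml and run-with-safety.yaml.
print(list(template.run_configs))
```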

@@ -0,0 +1,152 @@
version: "2"
image_name: llamacpp
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: llamacpp
    provider_type: remote::llamacpp
    config:
      openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
      api_key: ${env.LLAMACPP_API_KEY:}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
  - provider_id: wolfram-alpha
    provider_type: remote::wolfram-alpha
    config:
      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields:
- shield_id: ${env.SAFETY_MODEL}
  provider_id: llama-guard
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
  port: 8321

@@ -0,0 +1,147 @@
version: "2"
image_name: llamacpp
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: llamacpp
    provider_type: remote::llamacpp
    config:
      openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
      api_key: ${env.LLAMACPP_API_KEY:}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
  - provider_id: wolfram-alpha
    provider_type: remote::wolfram-alpha
    config:
      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
  port: 8321