diff --git a/llama_stack/providers/remote/inference/llamacpp/__init__.py b/llama_stack/providers/remote/inference/llamacpp/__init__.py
new file mode 100644
index 000000000..d8f766cd2
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llamacpp/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import InferenceProvider
+
+from .config import LlamaCppImplConfig
+
+
+async def get_adapter_impl(config: LlamaCppImplConfig, _deps) -> InferenceProvider:
+    # import dynamically so the provider's dependencies are only loaded when needed
+    from .llamacpp import LlamaCppInferenceAdapter
+
+    adapter = LlamaCppInferenceAdapter(config)
+    return adapter
diff --git a/llama_stack/providers/remote/inference/llamacpp/config.py b/llama_stack/providers/remote/inference/llamacpp/config.py
new file mode 100644
index 000000000..8285ff065
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llamacpp/config.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Union
+
+from llama_stack.schema_utils import json_schema_type
+
+from pydantic import BaseModel, Field
+
+
+class LlamaCppProviderDataValidator(BaseModel):
+    llamacpp_api_key: Union[str, None] = Field(
+        default=None,
+        description="API key for llama.cpp server (optional for local servers)",
+    )
+
+
+@json_schema_type
+class LlamaCppImplConfig(BaseModel):
+    api_key: Union[str, None] = Field(
+        default=None,
+        description="The llama.cpp server API key (optional for local servers)",
+    )
+
+    openai_compat_api_base: str = Field(
+        default="http://localhost:8080/v1",
+        description="The URL for the llama.cpp server with OpenAI-compatible API",
+    )
+
+    @classmethod
+    def sample_run_config(
+        cls, api_key: str = "${env.LLAMACPP_API_KEY:}"
+    ) -> dict[str, Any]:
+        return {
+            "openai_compat_api_base": "${env.LLAMACPP_URL:http://localhost:8080}/v1",
+            "api_key": api_key,
+        }
diff --git a/llama_stack/providers/remote/inference/llamacpp/llamacpp.py b/llama_stack/providers/remote/inference/llamacpp/llamacpp.py
new file mode 100644
index 000000000..63c002a9a
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llamacpp/llamacpp.py
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.models.models import Model
+from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+    LiteLLMOpenAIMixin,
+)
+
+
+class LlamaCppInferenceAdapter(LiteLLMOpenAIMixin):
+    config: LlamaCppImplConfig
+
+    def __init__(self, config: LlamaCppImplConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=[],  # llama.cpp can work with any GGUF model
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="llamacpp_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def register_model(self, model: Model) -> Model:
+        # llama.cpp can work with any GGUF model, so we accept any model name
+        # without validation against a predefined list
+        return model
+
+    async def initialize(self):
+        await super().initialize()
+
+    async def shutdown(self):
+        await super().shutdown()
diff --git a/llama_stack/templates/llamacpp/__init__.py b/llama_stack/templates/llamacpp/__init__.py
new file mode 100644
index 000000000..dbe3fc50e
--- /dev/null
+++ b/llama_stack/templates/llamacpp/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .llamacpp import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/llamacpp/build.yaml b/llama_stack/templates/llamacpp/build.yaml
new file mode 100644
index 000000000..14c57e917
--- /dev/null
+++ b/llama_stack/templates/llamacpp/build.yaml
@@ -0,0 +1,38 @@
+version: "2"
+distribution_spec:
+  description: Use llama.cpp server for running LLM inference
+  providers:
+    inference:
+      - remote::llamacpp
+      - inline::sentence-transformers
+    vector_io:
+      - inline::faiss
+      - remote::chromadb
+      - remote::pgvector
+    safety:
+      - inline::llama-guard
+    agents:
+      - inline::meta-reference
+    telemetry:
+      - inline::meta-reference
+    eval:
+      - inline::meta-reference
+    datasetio:
+      - remote::huggingface
+      - inline::localfs
+    scoring:
+      - inline::basic
+      - inline::llm-as-judge
+      - inline::braintrust
+    files:
+      - inline::localfs
+    tool_runtime:
+      - remote::brave-search
+      - remote::tavily-search
+      - remote::wolfram-alpha
+      - inline::rag-runtime
+      - remote::model-context-protocol
+image_type: conda
+additional_pip_packages:
+  - aiosqlite
+  - sqlalchemy[asyncio]
diff --git a/llama_stack/templates/llamacpp/doc_template.md b/llama_stack/templates/llamacpp/doc_template.md
new file mode 100644
index 000000000..f47e72034
--- /dev/null
+++ b/llama_stack/templates/llamacpp/doc_template.md
@@ -0,0 +1,82 @@
+# Llama Stack with llama.cpp
+
+This template shows you how to run Llama Stack with [llama.cpp](https://github.com/ggerganov/llama.cpp) as the inference provider.
+
+## Prerequisites
+
+1. **Install llama.cpp**: Follow the installation instructions from the [llama.cpp repository](https://github.com/ggerganov/llama.cpp)
+2. **Download a model**: Download a GGUF format model file (e.g., from Hugging Face)
+
+## Starting llama.cpp Server
+
+Before running Llama Stack, you need to start the llama.cpp server:
+
+```bash
+# Example: Start llama.cpp server with a model
+./llama-server -m /path/to/your/model.gguf -c 4096 --host 0.0.0.0 --port 8080
+```
+
+Common llama.cpp server options:
+
+- `-m`: Path to the GGUF model file
+- `-c`: Context size (default: 512)
+- `--host`: Host to bind to (default: 127.0.0.1)
+- `--port`: Port to bind to (default: 8080)
+- `-ngl`: Number of layers to offload to GPU
+- `--chat-template`: Chat template to use
+
+## Environment Variables
+
+Set these environment variables before running Llama Stack:
+
+```bash
+export LLAMACPP_URL=http://localhost:8080  # URL of your llama.cpp server (without /v1 suffix)
+export INFERENCE_MODEL=your-model-name     # Name/identifier for your model
+export LLAMACPP_API_KEY=""                 # API key (leave empty for local servers)
+```
+
+## Running Llama Stack
+
+```bash
+llama stack run llamacpp
+```
+
+## Configuration
+
+The template uses the following configuration:
+
+- **Inference Provider**: `remote::llamacpp` - Connects to your llama.cpp server via OpenAI-compatible API
+- **Default URL**: `http://localhost:8080` (configurable via `LLAMACPP_URL`)
+- **Vector Store**: FAISS for local vector storage
+- **Safety**: Llama Guard for content safety
+- **Other providers**: Standard Meta reference implementations
+
+## Model Support
+
+This template works with any GGUF format model supported by llama.cpp, including:
+
+- Llama 2/3 models
+- Code Llama models
+- Other transformer-based models converted to GGUF format
+
+## Troubleshooting
+
+1. **Connection refused**: Make sure your llama.cpp server is running and accessible
+2. **Model not found**: Verify the model path and that the GGUF file exists
+3. **Out of memory**: Reduce context size (`-c`) or use GPU offloading (`-ngl`)
+4. **Slow inference**: Consider using GPU acceleration or quantized models
+
+## Advanced Configuration
+
+You can customize the llama.cpp server configuration by modifying the server startup command. For production use, consider:
+
+- Using GPU acceleration with `-ngl` parameter
+- Adjusting batch size with `-b` parameter
+- Setting appropriate context size with `-c` parameter
+- Using multiple threads with `-t` parameter
+
+For more llama.cpp server options, run:
+
+```bash
+./llama-server --help
+```
diff --git a/llama_stack/templates/llamacpp/llamacpp.py b/llama_stack/templates/llamacpp/llamacpp.py
new file mode 100644
index 000000000..2ed0b8ca9
--- /dev/null
+++ b/llama_stack/templates/llamacpp/llamacpp.py
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
+from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::llamacpp"],
+        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "files": ["inline::localfs"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "remote::wolfram-alpha",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+    name = "llamacpp"
+    inference_provider = Provider(
+        provider_id="llamacpp",
+        provider_type="remote::llamacpp",
+        config=LlamaCppImplConfig.sample_run_config(),
+    )
+    vector_io_provider_faiss = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(
+            f"~/.llama/distributions/{name}"
+        ),
+    )
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="llamacpp",
+    )
+    safety_model = ModelInput(
+        model_id="${env.SAFETY_MODEL}",
+        provider_id="llamacpp",
+    )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
+    default_tool_groups = [
+        ToolGroupInput(
+            toolgroup_id="builtin::websearch",
+            provider_id="tavily-search",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::rag",
+            provider_id="rag-runtime",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::wolfram_alpha",
+            provider_id="wolfram-alpha",
+        ),
+    ]
+
+    return DistributionTemplate(
+        name=name,
+        distro_type="self_hosted",
+        description="Use llama.cpp server for running LLM inference",
+        container_image=None,
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                    "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
+                },
+                default_models=[inference_model, embedding_model],
+                default_tool_groups=default_tool_groups,
+            ),
+            "run-with-safety.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                    "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
+                    "safety": [
+                        Provider(
+                            provider_id="llama-guard",
+                            provider_type="inline::llama-guard",
+                            config={},
+                        ),
+                    ],
+                },
+                default_models=[
+                    inference_model,
+                    safety_model,
+                    embedding_model,
+                ],
+                default_shields=[
+                    ShieldInput(
+                        shield_id="${env.SAFETY_MODEL}",
+                        provider_id="llama-guard",
+                    ),
+                ],
+                default_tool_groups=default_tool_groups,
+            ),
+        },
+        run_config_env_vars={
+            "LLAMA_STACK_PORT": (
+                "8321",
+                "Port for the Llama Stack distribution server",
+            ),
+            "LLAMACPP_URL": (
+                "http://localhost:8080",
+                "URL of the llama.cpp server (without /v1 suffix)",
+            ),
+            "LLAMACPP_API_KEY": (
+                "",
+                "API key for llama.cpp server (leave empty for local servers)",
+            ),
+            "INFERENCE_MODEL": (
+                "llama-model",
+                "Inference model identifier for llama.cpp server",
+            ),
+            "SAFETY_MODEL": (
+                "llama-guard",
+                "Safety model identifier for llama.cpp server",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/llamacpp/run-with-safety.yaml b/llama_stack/templates/llamacpp/run-with-safety.yaml
new file mode 100644
index 000000000..7cfaf7cb4
--- /dev/null
+++ b/llama_stack/templates/llamacpp/run-with-safety.yaml
@@ -0,0 +1,152 @@
+version: "2"
+image_name: llamacpp
+apis:
+  - agents
+  - datasetio
+  - eval
+  - files
+  - inference
+  - safety
+  - scoring
+  - telemetry
+  - tool_runtime
+  - vector_io
+providers:
+  inference:
+    - provider_id: llamacpp
+      provider_type: remote::llamacpp
+      config:
+        openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
+        api_key: ${env.LLAMACPP_API_KEY:}
+    - provider_id: sentence-transformers
+      provider_type: inline::sentence-transformers
+      config: {}
+  vector_io:
+    - provider_id: faiss
+      provider_type: inline::faiss
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
+  safety:
+    - provider_id: llama-guard
+      provider_type: inline::llama-guard
+      config: {}
+  agents:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        persistence_store:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
+        responses_store:
+          type: sqlite
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
+  telemetry:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+        sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+        sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
+  eval:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
+  datasetio:
+    - provider_id: huggingface
+      provider_type: remote::huggingface
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
+    - provider_id: localfs
+      provider_type: inline::localfs
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
+  scoring:
+    - provider_id: basic
+      provider_type: inline::basic
+      config: {}
+    - provider_id: llm-as-judge
+      provider_type: inline::llm-as-judge
+      config: {}
+    - provider_id: braintrust
+      provider_type: inline::braintrust
+      config:
+        openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+    - provider_id: meta-reference-files
+      provider_type: inline::localfs
+      config:
+        storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
+        metadata_store:
+          type: sqlite
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
+  tool_runtime:
+    - provider_id: brave-search
+      provider_type: remote::brave-search
+      config:
+        api_key: ${env.BRAVE_SEARCH_API_KEY:}
+        max_results: 3
+    - provider_id: tavily-search
+      provider_type: remote::tavily-search
+      config:
+        api_key: ${env.TAVILY_SEARCH_API_KEY:}
+        max_results: 3
+    - provider_id: rag-runtime
+      provider_type: inline::rag-runtime
+      config: {}
+    - provider_id: model-context-protocol
+      provider_type: remote::model-context-protocol
+      config: {}
+    - provider_id: wolfram-alpha
+      provider_type: remote::wolfram-alpha
+      config:
+        api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
+models:
+  - metadata: {}
+    model_id: ${env.INFERENCE_MODEL}
+    provider_id: llamacpp
+    model_type: llm
+  - metadata: {}
+    model_id: ${env.SAFETY_MODEL}
+    provider_id: llamacpp
+    model_type: llm
+  - metadata:
+      embedding_dimension: 384
+    model_id: all-MiniLM-L6-v2
+    provider_id: sentence-transformers
+    model_type: embedding
+shields:
+  - shield_id: ${env.SAFETY_MODEL}
+    provider_id: llama-guard
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+  - toolgroup_id: builtin::websearch
+    provider_id: tavily-search
+  - toolgroup_id: builtin::rag
+    provider_id: rag-runtime
+  - toolgroup_id: builtin::wolfram_alpha
+    provider_id: wolfram-alpha
+server:
+  port: 8321
diff --git a/llama_stack/templates/llamacpp/run.yaml b/llama_stack/templates/llamacpp/run.yaml
new file mode 100644
index 000000000..5600ac267
--- /dev/null
+++ b/llama_stack/templates/llamacpp/run.yaml
@@ -0,0 +1,147 @@
+version: "2"
+image_name: llamacpp
+apis:
+  - agents
+  - datasetio
+  - eval
+  - files
+  - inference
+  - safety
+  - scoring
+  - telemetry
+  - tool_runtime
+  - vector_io
+providers:
+  inference:
+    - provider_id: llamacpp
+      provider_type: remote::llamacpp
+      config:
+        openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
+        api_key: ${env.LLAMACPP_API_KEY:}
+    - provider_id: sentence-transformers
+      provider_type: inline::sentence-transformers
+      config: {}
+  vector_io:
+    - provider_id: faiss
+      provider_type: inline::faiss
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
+  safety:
+    - provider_id: llama-guard
+      provider_type: inline::llama-guard
+      config:
+        excluded_categories: []
+  agents:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        persistence_store:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
+        responses_store:
+          type: sqlite
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
+  telemetry:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+        sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+        sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
+  eval:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
+  datasetio:
+    - provider_id: huggingface
+      provider_type: remote::huggingface
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
+    - provider_id: localfs
+      provider_type: inline::localfs
+      config:
+        kvstore:
+          type: sqlite
+          namespace: null
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
+  scoring:
+    - provider_id: basic
+      provider_type: inline::basic
+      config: {}
+    - provider_id: llm-as-judge
+      provider_type: inline::llm-as-judge
+      config: {}
+    - provider_id: braintrust
+      provider_type: inline::braintrust
+      config:
+        openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+    - provider_id: meta-reference-files
+      provider_type: inline::localfs
+      config:
+        storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
+        metadata_store:
+          type: sqlite
+          db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
+  tool_runtime:
+    - provider_id: brave-search
+      provider_type: remote::brave-search
+      config:
+        api_key: ${env.BRAVE_SEARCH_API_KEY:}
+        max_results: 3
+    - provider_id: tavily-search
+      provider_type: remote::tavily-search
+      config:
+        api_key: ${env.TAVILY_SEARCH_API_KEY:}
+        max_results: 3
+    - provider_id: rag-runtime
+      provider_type: inline::rag-runtime
+      config: {}
+    - provider_id: model-context-protocol
+      provider_type: remote::model-context-protocol
+      config: {}
+    - provider_id: wolfram-alpha
+      provider_type: remote::wolfram-alpha
+      config:
+        api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
+models:
+  - metadata: {}
+    model_id: ${env.INFERENCE_MODEL}
+    provider_id: llamacpp
+    model_type: llm
+  - metadata:
+      embedding_dimension: 384
+    model_id: all-MiniLM-L6-v2
+    provider_id: sentence-transformers
+    model_type: embedding
+shields: []
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+  - toolgroup_id: builtin::websearch
+    provider_id: tavily-search
+  - toolgroup_id: builtin::rag
+    provider_id: rag-runtime
+  - toolgroup_id: builtin::wolfram_alpha
+    provider_id: wolfram-alpha
+server:
+  port: 8321
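
The run configurations above assume an OpenAI-compatible llama.cpp server is already reachable at `LLAMACPP_URL`. A minimal sanity check, assuming `llama-server` is running locally on port 8080 and `your-model-name` stands in for whichever GGUF model you loaded:

```bash
# Send a small chat completion to the llama.cpp server's OpenAI-compatible endpoint.
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "your-model-name",
        "messages": [{"role": "user", "content": "Say hello"}],
        "max_tokens": 32
      }'
```

If this returns a JSON completion, `llama stack run llamacpp` should be able to reach the same endpoint through the `remote::llamacpp` provider.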