init: first remote llamacpp implementation
This commit is contained in:
parent
8374d4cefd
commit
ec73d0d55b
9 changed files with 675 additions and 0 deletions
17 llama_stack/providers/remote/inference/llamacpp/__init__.py Normal file
@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.inference import InferenceProvider

from .config import LlamaCppImplConfig


async def get_adapter_impl(config: LlamaCppImplConfig, _deps) -> InferenceProvider:
    # Import dynamically so the adapter module is only loaded when it is needed.
    from .llamacpp import LlamaCppInferenceAdapter

    adapter = LlamaCppInferenceAdapter(config)
    return adapter
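For context, a minimal sketch of how this factory could be invoked by hand. The provider registry normally calls `get_adapter_impl` itself, so this direct call is illustrative only, and it assumes a llama.cpp server already listening on the default address:

```python
import asyncio

from llama_stack.providers.remote.inference.llamacpp import get_adapter_impl
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig


async def main() -> None:
    config = LlamaCppImplConfig()  # defaults: http://localhost:8080/v1, no API key
    adapter = await get_adapter_impl(config, None)  # _deps is unused by this factory
    await adapter.initialize()


asyncio.run(main())
```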
40 llama_stack/providers/remote/inference/llamacpp/config.py Normal file
@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Union

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type


class LlamaCppProviderDataValidator(BaseModel):
    llamacpp_api_key: Union[str, None] = Field(
        default=None,
        description="API key for llama.cpp server (optional for local servers)",
    )


@json_schema_type
class LlamaCppImplConfig(BaseModel):
    api_key: Union[str, None] = Field(
        default=None,
        description="The llama.cpp server API key (optional for local servers)",
    )

    openai_compat_api_base: str = Field(
        default="http://localhost:8080/v1",
        description="The URL for the llama.cpp server with OpenAI-compatible API",
    )

    @classmethod
    def sample_run_config(
        cls, api_key: str = "${env.LLAMACPP_API_KEY:}"
    ) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "${env.LLAMACPP_URL:http://localhost:8080}/v1",
            "api_key": api_key,
        }
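As a quick sanity check on the config above, here is what the defaults and `sample_run_config` produce (a sketch; the keys and values are taken directly from the code in this commit):

```python
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig

config = LlamaCppImplConfig()
print(config.openai_compat_api_base)  # http://localhost:8080/v1
print(config.api_key)                 # None: local servers need no key

print(LlamaCppImplConfig.sample_run_config())
# {'openai_compat_api_base': '${env.LLAMACPP_URL:http://localhost:8080}/v1',
#  'api_key': '${env.LLAMACPP_API_KEY:}'}
```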
36 llama_stack/providers/remote/inference/llamacpp/llamacpp.py Normal file
@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.models.models import Model
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
    LiteLLMOpenAIMixin,
)


class LlamaCppInferenceAdapter(LiteLLMOpenAIMixin):
    _config: LlamaCppImplConfig

    def __init__(self, config: LlamaCppImplConfig):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=[],  # llama.cpp can work with any GGUF model
            api_key_from_config=config.api_key,
            provider_data_api_key_field="llamacpp_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def register_model(self, model: Model) -> Model:
        # llama.cpp can work with any GGUF model, so we accept any model name
        # without validation against a predefined list
        return model

    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
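Because `model_entries` is empty and `register_model` is a pass-through, any identifier is accepted and forwarded to the server unchanged. A hedged sketch of that behavior; the `Model` constructor fields used here are assumptions about the resource schema, not something this commit defines:

```python
import asyncio

from llama_stack.apis.models.models import Model
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
from llama_stack.providers.remote.inference.llamacpp.llamacpp import LlamaCppInferenceAdapter


async def main() -> None:
    adapter = LlamaCppInferenceAdapter(LlamaCppImplConfig())
    # Field names below are assumed from the Model resource type, not this diff.
    model = Model(
        identifier="my-local-gguf",
        provider_resource_id="my-local-gguf",
        provider_id="llamacpp",
    )
    registered = await adapter.register_model(model)
    assert registered is model  # no validation: returned unchanged


asyncio.run(main())
```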
7 llama_stack/templates/llamacpp/__init__.py Normal file
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .llamacpp import get_distribution_template  # noqa: F401
38 llama_stack/templates/llamacpp/build.yaml Normal file
@@ -0,0 +1,38 @@
version: "2"
distribution_spec:
  description: Use llama.cpp server for running LLM inference
  providers:
    inference:
    - remote::llamacpp
    - inline::sentence-transformers
    vector_io:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    files:
    - inline::localfs
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
    - remote::wolfram-alpha
    - inline::rag-runtime
    - remote::model-context-protocol
image_type: conda
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
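This spec is presumably consumed by the standard build flow; the invocation below is an assumption about the `llama` CLI shape, with the `conda` image type matching the `image_type` declared above:

```bash
# Assumed invocation: build the conda environment for this template.
llama stack build --template llamacpp --image-type conda
```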
82 llama_stack/templates/llamacpp/doc_template.md Normal file
@@ -0,0 +1,82 @@
# Llama Stack with llama.cpp

This template shows you how to run Llama Stack with [llama.cpp](https://github.com/ggerganov/llama.cpp) as the inference provider.

## Prerequisites

1. **Install llama.cpp**: Follow the installation instructions from the [llama.cpp repository](https://github.com/ggerganov/llama.cpp)
2. **Download a model**: Download a GGUF format model file (e.g., from Hugging Face)

## Starting the llama.cpp Server

Before running Llama Stack, you need to start the llama.cpp server:

```bash
# Example: Start llama.cpp server with a model
./llama-server -m /path/to/your/model.gguf -c 4096 --host 0.0.0.0 --port 8080
```

Common llama.cpp server options:

- `-m`: Path to the GGUF model file
- `-c`: Context size (default: 512)
- `--host`: Host to bind to (default: 127.0.0.1)
- `--port`: Port to bind to (default: 8080)
- `-ngl`: Number of layers to offload to GPU
- `--chat-template`: Chat template to use

## Environment Variables

Set these environment variables before running Llama Stack:

```bash
export LLAMACPP_URL=http://localhost:8080  # URL of your llama.cpp server (without /v1 suffix)
export INFERENCE_MODEL=your-model-name     # Name/identifier for your model
export LLAMACPP_API_KEY=""                 # API key (leave empty for local servers)
```

## Running Llama Stack

```bash
llama stack run llamacpp
```

## Configuration

The template uses the following configuration:

- **Inference Provider**: `remote::llamacpp` - Connects to your llama.cpp server via its OpenAI-compatible API
- **Default URL**: `http://localhost:8080` (configurable via `LLAMACPP_URL`)
- **Vector Store**: FAISS for local vector storage
- **Safety**: Llama Guard for content safety
- **Other providers**: Standard Meta reference implementations

## Model Support

This template works with any GGUF format model supported by llama.cpp, including:

- Llama 2/3 models
- Code Llama models
- Other transformer-based models converted to GGUF format

## Troubleshooting

1. **Connection refused**: Make sure your llama.cpp server is running and accessible
2. **Model not found**: Verify the model path and that the GGUF file exists
3. **Out of memory**: Reduce the context size (`-c`) or use GPU offloading (`-ngl`)
4. **Slow inference**: Consider using GPU acceleration or quantized models

## Advanced Configuration

You can customize the llama.cpp server configuration by modifying the server startup command. For production use, consider:

- Using GPU acceleration with the `-ngl` parameter
- Adjusting batch size with the `-b` parameter
- Setting an appropriate context size with the `-c` parameter
- Using multiple threads with the `-t` parameter

For more llama.cpp server options, run:

```bash
./llama-server --help
```
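One check the doc template does not spell out: since the provider talks to the server's OpenAI-compatible API, you can verify the server directly before starting the stack. The routes below are llama.cpp's standard OpenAI-compatible endpoints; the model name is a placeholder:

```bash
# Confirm the server is up and see which model it is serving.
curl http://localhost:8080/v1/models

# Smoke-test a chat completion against the same endpoint the provider will use.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "your-model-name", "messages": [{"role": "user", "content": "Say hello."}]}'
```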
156 llama_stack/templates/llamacpp/llamacpp.py Normal file
@@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import (
    ModelInput,
    Provider,
    ShieldInput,
    ToolGroupInput,
)
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.llamacpp.config import LlamaCppImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::llamacpp"],
        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "files": ["inline::localfs"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "remote::wolfram-alpha",
            "inline::rag-runtime",
            "remote::model-context-protocol",
        ],
    }
    name = "llamacpp"
    inference_provider = Provider(
        provider_id="llamacpp",
        provider_type="remote::llamacpp",
        config=LlamaCppImplConfig.sample_run_config(),
    )
    vector_io_provider_faiss = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
    )
    files_provider = Provider(
        provider_id="meta-reference-files",
        provider_type="inline::localfs",
        config=LocalfsFilesImplConfig.sample_run_config(
            f"~/.llama/distributions/{name}"
        ),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="llamacpp",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="llamacpp",
    )
    embedding_model = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id="sentence-transformers",
        model_type=ModelType.embedding,
        metadata={
            "embedding_dimension": 384,
        },
    )
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="tavily-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::wolfram_alpha",
            provider_id="wolfram-alpha",
        ),
    ]

    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Use llama.cpp server for running LLM inference",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
                    "files": [files_provider],
                },
                default_models=[inference_model, embedding_model],
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
                    "files": [files_provider],
                    "safety": [
                        Provider(
                            provider_id="llama-guard",
                            provider_type="inline::llama-guard",
                            config={},
                        ),
                    ],
                },
                default_models=[
                    inference_model,
                    safety_model,
                    embedding_model,
                ],
                default_shields=[
                    ShieldInput(
                        shield_id="${env.SAFETY_MODEL}",
                        provider_id="llama-guard",
                    ),
                ],
                default_tool_groups=default_tool_groups,
            ),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (
                "8321",
                "Port for the Llama Stack distribution server",
            ),
            "LLAMACPP_URL": (
                "http://localhost:8080",
                "URL of the llama.cpp server (without /v1 suffix)",
            ),
            "LLAMACPP_API_KEY": (
                "",
                "API key for llama.cpp server (leave empty for local servers)",
            ),
            "INFERENCE_MODEL": (
                "llama-model",
                "Inference model identifier for llama.cpp server",
            ),
            "SAFETY_MODEL": (
                "llama-guard",
                "Safety model identifier for llama.cpp server",
            ),
        },
    )
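A quick way to sanity-check the template wiring, as a sketch: the field names read below are assumed to be exposed on `DistributionTemplate` exactly as they are passed to its constructor above.

```python
from llama_stack.templates.llamacpp import get_distribution_template

template = get_distribution_template()
print(template.name)                      # llamacpp
print(sorted(template.providers.keys()))  # the ten APIs configured above
print(sorted(template.run_configs))       # ['run-with-safety.yaml', 'run.yaml']
```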
152 llama_stack/templates/llamacpp/run-with-safety.yaml Normal file
@@ -0,0 +1,152 @@
version: "2"
image_name: llamacpp
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: llamacpp
    provider_type: remote::llamacpp
    config:
      openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
      api_key: ${env.LLAMACPP_API_KEY:}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
  - provider_id: wolfram-alpha
    provider_type: remote::wolfram-alpha
    config:
      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields:
- shield_id: ${env.SAFETY_MODEL}
  provider_id: llama-guard
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
  port: 8321
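To exercise this safety-enabled variant, both the inference and safety models must be servable by the llama.cpp provider. A plausible launch sequence, with env var names taken from this commit and the run-config path being the repo-relative path of the file above:

```bash
export LLAMACPP_URL=http://localhost:8080
export INFERENCE_MODEL=your-model-name
export SAFETY_MODEL=llama-guard
llama stack run llama_stack/templates/llamacpp/run-with-safety.yaml
```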
147 llama_stack/templates/llamacpp/run.yaml Normal file
@@ -0,0 +1,147 @@
version: "2"
image_name: llamacpp
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: llamacpp
    provider_type: remote::llamacpp
    config:
      openai_compat_api_base: ${env.LLAMACPP_URL:http://localhost:8080}/v1
      api_key: ${env.LLAMACPP_API_KEY:}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/llamacpp/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/files_metadata.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
  - provider_id: wolfram-alpha
    provider_type: remote::wolfram-alpha
    config:
      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llamacpp}/inference_store.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: llamacpp
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
  port: 8321
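Once the stack is up on port 8321 with this config, a minimal client call might look like the following. This is a sketch: it assumes the separate `llama-stack-client` package and its `inference.chat_completion` method, neither of which is part of this commit.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
response = client.inference.chat_completion(
    model_id="your-model-name",  # must match the model registered via INFERENCE_MODEL
    messages=[{"role": "user", "content": "Hello from llama.cpp!"}],
)
print(response.completion_message.content)
```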