Update more distribution docs to be simpler and partially codegen'ed

This commit is contained in:
Ashwin Bharambe 2024-11-20 14:44:04 -08:00
parent e84d4436b5
commit 2411a44833
51 changed files with 1188 additions and 291 deletions

View file

@ -1,45 +0,0 @@
version: '2'
image_name: local
name: bedrock
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
safety:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/databricks/build.yaml

View file

@ -1,4 +1,32 @@
{
"hf-serverless": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [ "together": [
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
@ -26,6 +54,33 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"vllm-gpu": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [ "remote-vllm": [
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
@ -108,6 +163,33 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"bedrock": [
"aiosqlite",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [ "meta-reference-gpu": [
"accelerate", "accelerate",
"aiosqlite", "aiosqlite",
@ -167,5 +249,33 @@
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-endpoint/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-serverless/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -1,48 +0,0 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: "host"
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
ports:
- "11434:11434"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/llamastack-run-ollama.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
ollama:

View file

@ -1,46 +0,0 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: ollama
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
excluded_categories: []
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}

View file

@ -6,59 +6,58 @@
self
```

The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::bedrock` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `remote::bedrock` |
| telemetry | `inline::meta-reference` |

### Environment Variables

The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)

### Prerequisite: API Keys

Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).

## Running Llama Stack with AWS Bedrock

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-bedrock \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

### Via Conda

```bash
llama stack build --template bedrock --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
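Once the server is up via either method, a quick sanity check from Python can confirm it is reachable and that the Bedrock provider registered its models. This snippet is not part of the distribution docs in this commit; it assumes `pip install llama-stack-client` and a server listening on the port chosen above.

```python
# Hypothetical smoke test against the server started above (not part of this commit).
from llama_stack_client import LlamaStackClient

# Assumes the server was started with LLAMA_STACK_PORT=5001 as in the docs above.
client = LlamaStackClient(base_url="http://localhost:5001")

# Print whatever models the bedrock provider registered in run.yaml.
for model in client.models.list():
    print(model.identifier)
```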

View file

@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-fireworks \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -70,6 +68,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template meta-reference-gpu --image-type conda
llama stack run distributions/meta-reference-gpu/run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -89,7 +85,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -66,9 +66,7 @@ docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-ollama \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434

View file

@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-tgi \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template tgi --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-together \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -69,6 +67,6 @@ docker run \
```bash
llama stack build --template together --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
    @classmethod
    def sample_run_config(cls):
        return {
            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
            "max_tokens": "${env.MAX_TOKENS:4096}",
            "enforce_eager": "${env.ENFORCE_EAGER:False}",
            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
        }

    @field_validator("model")

View file

@ -4,11 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig


class BedrockConfig(BedrockBaseConfig):
    pass

View file

@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
description="Your Hugging Face user access token (will default to locally saved token if not provided)", description="Your Hugging Face user access token (will default to locally saved token if not provided)",
) )
@classmethod
def sample_run_config(
cls,
endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"endpoint_name": endpoint_name,
"api_token": api_token,
}
@json_schema_type
class InferenceAPIImplConfig(BaseModel):
@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
default=None,
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
repo: str = "${env.INFERENCE_MODEL}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"huggingface_repo": repo,
"api_token": api_token,
}
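To see why these `sample_run_config()` helpers matter for the codegen, note that the returned dicts are written more or less verbatim into the generated `run.yaml` provider entries, with the `${env.*}` placeholders left for the server to resolve at startup. The sketch below is illustrative only; the surrounding provider fields are assumptions based on the generated files later in this commit.

```python
# Illustrative only: how a sample_run_config() result might land in a generated run.yaml.
import yaml

config = {
    "huggingface_repo": "${env.INFERENCE_MODEL}",  # same placeholders as sample_run_config()
    "api_token": "${env.HF_API_TOKEN}",
}
provider_entry = {
    "provider_id": "hf-serverless",
    "provider_type": "remote::hf::serverless",
    "config": config,
}
print(yaml.safe_dump({"inference": [provider_entry]}, sort_keys=False))
```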

View file

@ -5,11 +5,9 @@
# the root directory of this source tree. # the root directory of this source tree.
from typing import Optional from typing import Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@json_schema_type
class BedrockBaseConfig(BaseModel): class BedrockBaseConfig(BaseModel):
aws_access_key_id: Optional[str] = Field( aws_access_key_id: Optional[str] = Field(
default=None, default=None,
@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
default=3600, default=3600,
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
) )
@classmethod
def sample_run_config(cls, **kwargs):
return {}

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .bedrock import get_distribution_template # noqa: F401

View file

@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::bedrock"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["remote::bedrock"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
return DistributionTemplate(
name="bedrock",
distro_type="self_hosted",
description="Use AWS Bedrock for running LLM inference and safety",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[],
run_configs={
"run.yaml": RunConfigSettings(),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
},
)
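As a sketch of how a template like this is consumed (the exact codegen entry point is not shown in this diff), the returned `DistributionTemplate` exposes the fields and methods referenced elsewhere in this commit, such as `run_configs` and `generate_markdown_docs()`:

```python
# Hypothetical driver code; field and method names come from DistributionTemplate in this commit.
from llama_stack.templates.bedrock import get_distribution_template

template = get_distribution_template()
print(template.name)                      # "bedrock"
print(sorted(template.run_configs))       # ["run.yaml"]
print(template.generate_markdown_docs())  # rendered from doc_template.md, since template_path is set
```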

View file

@ -1,9 +1,19 @@
version: '2'
name: bedrock
distribution_spec:
  description: Use AWS Bedrock for running LLM inference and safety
  docker_image: null
  providers:
    inference:
    - remote::bedrock
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - remote::bedrock
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,63 @@
# Bedrock Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
## Running Llama Stack with AWS Bedrock
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
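For reference, the template above could also be rendered standalone with Jinja, feeding in the same kinds of values the codegen supplies. The variable values below are illustrative placeholders, not taken from the commit.

```python
# Rough, standalone rendering of doc_template.md (assumes jinja2 is installed and the
# file is in the current directory).
from jinja2 import Template

with open("doc_template.md") as f:
    template = Template(f.read())

print(
    template.render(
        name="bedrock",
        providers_table="| API | Provider(s) |\n|-----|-------------|\n| inference | `remote::bedrock` |",
        run_config_env_vars={
            "LLAMASTACK_PORT": ("5001", "Port for the Llama Stack distribution server"),
        },
        default_models=[],  # the bedrock template registers no default models
    )
)
```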

View file

@ -0,0 +1,49 @@
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
safety:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
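The generated run.yaml files lean heavily on `${env.VAR:default}` placeholders (for example `SQLITE_STORE_DIR` above). The server resolves these at startup; the helper below only illustrates the intended semantics and is not the actual implementation.

```python
# Illustrative-only resolver for ${env.VAR:default} placeholders; the real logic lives
# in the Llama Stack server, not in this snippet.
import os
import re

def resolve_placeholders(text: str) -> str:
    def _sub(match: re.Match) -> str:
        name, _, default = match.group(1).partition(":")
        return os.environ.get(name, default)
    return re.sub(r"\$\{env\.([^}]+)\}", _sub, text)

print(resolve_placeholders("${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db"))
# -> ~/.llama/distributions/bedrock/registry.db when SQLITE_STORE_DIR is unset
```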

View file

@ -1,9 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: inline::faiss
safety: inline::llama-guard
agents: meta-reference
telemetry: meta-reference

View file

@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -55,6 +53,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_endpoint import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-endpoint
distribution_spec:
  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
  docker_image: null
  providers:
    inference:
    - remote::hf::endpoint
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::endpoint"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-endpoint",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-endpoint",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-endpoint-safety",
)
return DistributionTemplate(
name="hf-endpoint",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-endpoint-safety",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(
endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint name for the main inference model",
),
"SAFETY_INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint for the safety model",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model served by the HF Inference Endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model served by the HF Inference Endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-endpoint-safety
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-endpoint-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_serverless import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-serverless
distribution_spec:
  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
  docker_image: null
  providers:
    inference:
    - remote::hf::serverless
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::serverless"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-serverless",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-serverless",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-serverless-safety",
)
return DistributionTemplate(
name="hf-serverless",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-serverless-safety",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(
repo="${env.SAFETY_MODEL}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model to be served by the HF Serverless endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model to be served by the HF Serverless endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-serverless-safety
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.SAFETY_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-serverless-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,13 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: inline::meta-reference
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference

View file

@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -66,8 +62,8 @@ docker run \
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -75,7 +71,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/{{ name }}/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .meta_reference import get_distribution_template # noqa: F401

View file

@ -0,0 +1,54 @@
# Meta Reference Quantized Distribution
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
The only difference from the `meta-reference-gpu` distribution is that it supports more efficient inference, e.g. with fp8 or int4 quantization.
### Step 0. Prerequisite - Downloading Models
Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models.
```
$ ls ~/.llama/checkpoints
Llama3.2-3B-Instruct:int4-qlora-eo8
```
### Step 1. Start the Distribution
#### (Option 1) Start with Docker
```
$ cd distributions/meta-reference-quantized-gpu && docker compose up
```
> [!NOTE]
> This assumes you have access to a GPU to start a local server.
> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml
```
#### (Option 2) Start with Conda
1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
2. Build the `meta-reference-quantized-gpu` distribution
```
$ llama stack build --template meta-reference-quantized-gpu --image-type conda
```
3. Start running distribution
```
$ cd distributions/meta-reference-quantized-gpu
$ llama stack run ./run.yaml
```

View file

@ -0,0 +1,100 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.inline.inference.meta_reference import (
MetaReferenceInferenceConfig,
)
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::meta-reference"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="meta-reference-inference",
provider_type="inline::meta-reference",
config=MetaReferenceInferenceConfig.sample_run_config(
model="${env.INFERENCE_MODEL}",
checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="meta-reference-inference",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="meta-reference-safety",
)
return DistributionTemplate(
name="meta-reference-gpu",
distro_type="self_hosted",
description="Use Meta Reference for running LLM inference",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="meta-reference-safety",
provider_type="inline::meta-reference",
config=MetaReferenceInferenceConfig.sample_run_config(
model="${env.SAFETY_MODEL}",
checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}",
),
),
],
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the Meta Reference server",
),
"INFERENCE_CHECKPOINT_DIR": (
"null",
"Directory containing the Meta Reference model checkpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use",
),
"SAFETY_CHECKPOINT_DIR": (
"null",
"Directory containing the Llama-Guard model checkpoint",
),
},
)

View file

@ -55,9 +55,7 @@ docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
export LLAMA_STACK_PORT=5001
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \

View file

@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class RunConfigSettings(BaseModel):
    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
    default_models: Optional[List[ModelInput]] = None
    default_shields: Optional[List[ShieldInput]] = None

    def run_config(
@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
__distro_dir__=f"distributions/{name}", __distro_dir__=f"distributions/{name}",
db_name="registry.db", db_name="registry.db",
), ),
models=self.default_models, models=self.default_models or [],
shields=self.default_shields or [], shields=self.default_shields or [],
) )
@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):
providers: Dict[str, List[str]]
run_configs: Dict[str, RunConfigSettings]
template_path: Optional[Path] = None

# Optional configuration
run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel):
with open(yaml_output_dir / yaml_pth, "w") as f:
    yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

if self.template_path:
    docs = self.generate_markdown_docs()
    with open(doc_output_dir / f"{self.name}.md", "w") as f:
        f.write(docs if docs.endswith("\n") else docs + "\n")
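The net effect of making `default_models` and `template_path` optional is that a template can now be defined with little more than a provider map and an empty `RunConfigSettings()`, and markdown generation is skipped when there is no `doc_template.md`. A minimal sketch, assuming the field names shown above:

```python
# Minimal sketch of a docs-less template enabled by the Optional fields above.
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings

template = DistributionTemplate(
    name="vllm-gpu",
    distro_type="self_hosted",
    description="Use a built-in vLLM engine for running LLM inference",
    docker_image=None,
    template_path=None,          # no doc_template.md -> markdown generation is skipped
    providers={"inference": ["inline::vllm"]},
    default_models=[],
    run_configs={"run.yaml": RunConfigSettings()},  # default_models now optional here too
)
```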

View file

@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -53,8 +51,8 @@ docker run \
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vllm import get_distribution_template # noqa: F401

View file

@ -0,0 +1,19 @@
version: '2'
name: vllm-gpu
distribution_spec:
description: Use a built-in vLLM engine for running LLM inference
docker_image: null
providers:
inference:
- inline::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,58 @@
version: '2'
image_name: vllm-gpu
docker_image: null
conda_env: vllm-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: vllm
provider_type: inline::vllm
config:
model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
max_tokens: ${env.MAX_TOKENS:4096}
enforce_eager: ${env.ENFORCE_EAGER:False}
gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.vllm import VLLMConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::vllm"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="vllm",
provider_type="inline::vllm",
config=VLLMConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm",
)
return DistributionTemplate(
name="vllm-gpu",
distro_type="self_hosted",
description="Use a built-in vLLM engine for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the vLLM engine",
),
"TENSOR_PARALLEL_SIZE": (
"1",
"Number of tensor parallel replicas (number of GPUs to use).",
),
"MAX_TOKENS": (
"4096",
"Maximum number of tokens to generate.",
),
"ENFORCE_EAGER": (
"False",
"Whether to use eager mode for inference (otherwise cuda graphs are used).",
),
"GPU_MEMORY_UTILIZATION": (
"0.7",
"GPU memory utilization for the vLLM engine.",
),
},
)