mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00

Update more distribution docs to be simpler and partially codegen'ed
This commit is contained in:
parent e84d4436b5
commit 2411a44833
51 changed files with 1188 additions and 291 deletions

@@ -1,45 +0,0 @@
version: '2'
image_name: local
name: bedrock
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: bedrock0
    provider_type: remote::bedrock
    config:
      aws_access_key_id: <AWS_ACCESS_KEY_ID>
      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
      aws_session_token: <AWS_SESSION_TOKEN>
      region_name: <AWS_REGION>
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  safety:
  - provider_id: bedrock0
    provider_type: remote::bedrock
    config:
      aws_access_key_id: <AWS_ACCESS_KEY_ID>
      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
      aws_session_token: <AWS_SESSION_TOKEN>
      region_name: <AWS_REGION>
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

distributions/bedrock/run.yaml (Symbolic link)
@@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/databricks/build.yaml

@@ -1,4 +1,32 @@
 {
+  "hf-serverless": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
   "together": [
     "aiosqlite",
     "blobfile",

@@ -26,6 +54,33 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
+  "vllm-gpu": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "vllm",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
   "remote-vllm": [
     "aiosqlite",
     "blobfile",

@@ -108,6 +163,33 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
+  "bedrock": [
+    "aiosqlite",
+    "blobfile",
+    "boto3",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
   "meta-reference-gpu": [
     "accelerate",
     "aiosqlite",

@@ -167,5 +249,33 @@
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
-  ]
+  ],
+  "hf-endpoint": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ]
 }

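The per-distribution lists above pair each template with its Python dependencies, and some entries (e.g. `"sentence-transformers --no-deps"`, `"torch --index-url ..."`) bundle pip flags together with the package name, so any consumer of this JSON has to split entries shell-style before handing them to pip. Below is a minimal sketch of such a consumer; the file path and the idea that these lists feed `pip install` are assumptions for illustration, not something this diff states:

```python
import json
import shlex
import subprocess
import sys

# Hypothetical location of the JSON shown above; adjust as needed.
DEPS_FILE = "distributions/dependencies.json"

def install_template_deps(template_name: str) -> None:
    """Install the dependency list recorded for one distribution template."""
    with open(DEPS_FILE) as f:
        deps_by_template = json.load(f)
    for entry in deps_by_template[template_name]:
        # Entries may carry extra pip flags ("--no-deps", "--index-url ...")
        # alongside the package name, so split them shell-style.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", *shlex.split(entry)]
        )

if __name__ == "__main__":
    install_template_deps("bedrock")
```
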
@@ -1 +0,0 @@
../../llama_stack/templates/hf-endpoint/build.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/hf-serverless/build.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

@@ -1,48 +0,0 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: "host"
    volumes:
      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
    ports:
      - "11434:11434"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      - ollama
    image: llamastack/distribution-ollama
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ./run.yaml:/root/llamastack-run-ollama.yaml
    ports:
      - "5000:5000"
    # Hack: wait for ollama server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  ollama:

@@ -1,46 +0,0 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: ollama
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
  provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
  provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}

@@ -6,59 +6,58 @@
 self
 ```
 
-### Connect to a Llama Stack Bedrock Endpoint
-- You may connect to Amazon Bedrock APIs for running LLM inference
+The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:
 
-The `llamastack/distribution-bedrock` distribution consists of the following provider configurations.
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::bedrock` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `remote::bedrock` |
+| telemetry | `inline::meta-reference` |
 
-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |--------------- |---------------- |---------------- |---------------- |---------------- |
-| **Provider(s)** | remote::bedrock | meta-reference | meta-reference | remote::bedrock | meta-reference |
+### Environment Variables
 
+The following environment variables can be configured:
 
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 
-### Docker: Start the Distribution (Single Node CPU)
 
-> [!NOTE]
-> This assumes you have valid AWS credentials configured with access to Amazon Bedrock.
+### Prerequisite: API Keys
 
-```
-$ cd distributions/bedrock && docker compose up
-```
+Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
+
+## Running Llama Stack with AWS Bedrock
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  llamastack/distribution-bedrock \
+  --port $LLAMA_STACK_PORT \
+  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
+```
 
-Make sure in your `run.yaml` file, your inference provider is pointing to the correct AWS configuration. E.g.
-```
-inference:
-  - provider_id: bedrock0
-    provider_type: remote::bedrock
-    config:
-      aws_access_key_id: <AWS_ACCESS_KEY_ID>
-      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
-      aws_session_token: <AWS_SESSION_TOKEN>
-      region_name: <AWS_REGION>
-```
-
-### Conda llama stack run (Single Node CPU)
+### Via Conda
 
 ```bash
 llama stack build --template bedrock --image-type conda
-# -- modify run.yaml with valid AWS credentials
-llama stack run ./run.yaml
-```
-
-### (Optional) Update Model Serving Configuration
-
-Use `llama-stack-client models list` to check the available models served by Amazon Bedrock.
-
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier | llama_model | provider_id | metadata |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct | meta.llama3-1-8b-instruct-v1:0 | bedrock0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct | meta.llama3-1-70b-instruct-v1:0 | bedrock0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct | meta.llama3-1-405b-instruct-v1:0 | bedrock0 | {} |
-+------------------------------+------------------------------+---------------+------------+
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
 ```

@@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-fireworks \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```

@@ -70,6 +68,6 @@ docker run \
 ```bash
 llama stack build --template fireworks --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```

@@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-meta-reference-gpu \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-meta-reference-gpu \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

@@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 
 ```bash
 llama stack build --template meta-reference-gpu --image-type conda
-llama stack run ./run.yaml \
+llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -89,7 +85,7 @@ llama stack run ./run.yaml \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
-llama stack run ./run-with-safety.yaml \
+llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

@@ -66,9 +66,7 @@ docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-ollama \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://host.docker.internal:11434

@@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-tgi \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT

@@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 ```bash
 llama stack build --template tgi --image-type conda
 llama stack run ./run.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
-llama stack run ./run-with-safety.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
-  --env SAFETY_MODEL=$SAFETY_MODEL
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```

@@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-together \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```

@@ -69,6 +67,6 @@ docker run \
 ```bash
 llama stack build --template together --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```

@@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
     @classmethod
     def sample_run_config(cls):
         return {
-            "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-            "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
-            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
-            "enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
-            "gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
+            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
+            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
+            "max_tokens": "${env.MAX_TOKENS:4096}",
+            "enforce_eager": "${env.ENFORCE_EAGER:False}",
+            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
         }
 
     @field_validator("model")

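The quoted `"${env.VAR:default}"` strings in `sample_run_config` are not evaluated by Python; they are placeholders that get substituted from the environment when the generated run.yaml is loaded. The following is a rough illustration of the placeholder syntax only; this resolver is a sketch, not the project's actual substitution code:

```python
import os
import re

# Matches ${env.NAME} or ${env.NAME:default}; the default may contain
# colons (e.g. a URL) but not a closing brace.
_ENV_PATTERN = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}")

def resolve_env_placeholders(value: str) -> str:
    """Replace ${env.NAME:default} placeholders with environment values."""
    def _substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        env_value = os.environ.get(name)
        if env_value is not None:
            return env_value
        if default is not None:
            return default
        raise ValueError(f"{name} is not set and has no default")
    return _ENV_PATTERN.sub(_substitute, value)

# Assuming these variables are unset in the current shell:
assert resolve_env_placeholders("${env.MAX_TOKENS:4096}") == "4096"
assert (
    resolve_env_placeholders("${env.OLLAMA_URL:http://127.0.0.1:11434}")
    == "http://127.0.0.1:11434"
)
```
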
@@ -4,11 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_models.schema_utils import json_schema_type
-
 from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
 
 
-@json_schema_type
 class BedrockConfig(BedrockBaseConfig):
     pass

@@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )
 
+    @classmethod
+    def sample_run_config(
+        cls,
+        endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
+        api_token: str = "${env.HF_API_TOKEN}",
+        **kwargs,
+    ):
+        return {
+            "endpoint_name": endpoint_name,
+            "api_token": api_token,
+        }
+
 
 @json_schema_type
 class InferenceAPIImplConfig(BaseModel):

@@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
         default=None,
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )
+
+    @classmethod
+    def sample_run_config(
+        cls,
+        repo: str = "${env.INFERENCE_MODEL}",
+        api_token: str = "${env.HF_API_TOKEN}",
+        **kwargs,
+    ):
+        return {
+            "huggingface_repo": repo,
+            "api_token": api_token,
+        }

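These `sample_run_config` classmethods are what the new template code below uses to emit provider config blocks instead of hand-written YAML. Based solely on the signatures in this hunk, one would expect:

```python
from llama_stack.providers.remote.inference.tgi import (
    InferenceAPIImplConfig,
    InferenceEndpointImplConfig,
)

# Defaults produce env-var placeholders that land verbatim in run.yaml.
print(InferenceEndpointImplConfig.sample_run_config())
# {'endpoint_name': '${env.INFERENCE_ENDPOINT_NAME}', 'api_token': '${env.HF_API_TOKEN}'}

# Overriding a keyword swaps in a different placeholder, as the
# hf-serverless template below does for its safety provider.
print(InferenceAPIImplConfig.sample_run_config(repo="${env.SAFETY_MODEL}"))
# {'huggingface_repo': '${env.SAFETY_MODEL}', 'api_token': '${env.HF_API_TOKEN}'}
```
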
@@ -5,11 +5,9 @@
 # the root directory of this source tree.
 from typing import Optional
 
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
 
 
-@json_schema_type
 class BedrockBaseConfig(BaseModel):
     aws_access_key_id: Optional[str] = Field(
         default=None,

@@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
         default=3600,
         description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
     )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs):
+        return {}

llama_stack/templates/bedrock/__init__.py (Normal file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .bedrock import get_distribution_template  # noqa: F401

llama_stack/templates/bedrock/bedrock.py (Normal file)
@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::bedrock"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["remote::bedrock"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    return DistributionTemplate(
        name="bedrock",
        distro_type="self_hosted",
        description="Use AWS Bedrock for running LLM inference and safety",
        docker_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[],
        run_configs={
            "run.yaml": RunConfigSettings(),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
        },
    )

@@ -1,9 +1,19 @@
+version: '2'
 name: bedrock
 distribution_spec:
-  description: Use Amazon Bedrock APIs.
+  description: Use AWS Bedrock for running LLM inference and safety
+  docker_image: null
   providers:
-    inference: remote::bedrock
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::bedrock
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - remote::bedrock
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

llama_stack/templates/bedrock/doc_template.md (Normal file)
@@ -0,0 +1,63 @@
# Bedrock Distribution

The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:

{{ providers_table }}


{% if run_config_env_vars %}
### Environment Variables

The following environment variables can be configured:

{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}

{% if default_models %}
### Models

The following models are available by default:

{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}


### Prerequisite: API Keys

Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).


## Running Llama Stack with AWS Bedrock

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

### Via Conda

```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

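The doc_template.md above is a Jinja2 template; `name`, `providers_table`, `run_config_env_vars`, and `default_models` are filled in by the codegen. Here is a minimal sketch of rendering it with jinja2, with a context inferred from the template's own variables rather than taken from the project's generator:

```python
from pathlib import Path

from jinja2 import Template

template = Template(Path("doc_template.md").read_text())
markdown = template.render(
    name="bedrock",
    providers_table=(
        "| API | Provider(s) |\n"
        "|-----|-------------|\n"
        "| inference | `remote::bedrock` |"
    ),
    run_config_env_vars={
        "LLAMASTACK_PORT": ("5001", "Port for the Llama Stack distribution server"),
    },
    default_models=[],  # bedrock.py registers no default models
)
print(markdown)
```
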
llama_stack/templates/bedrock/run.yaml (Normal file)
@@ -0,0 +1,49 @@
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: bedrock
    provider_type: remote::bedrock
    config: {}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
  safety:
  - provider_id: bedrock
    provider_type: remote::bedrock
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

@@ -1,9 +0,0 @@
name: databricks
distribution_spec:
  description: Use Databricks for running LLM inference
  providers:
    inference: remote::databricks
    memory: inline::faiss
    safety: inline::llama-guard
    agents: meta-reference
    telemetry: meta-reference

@@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```

@@ -55,6 +53,6 @@ docker run \
 ```bash
 llama stack build --template fireworks --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```

llama_stack/templates/hf-endpoint/__init__.py (Normal file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .hf_endpoint import get_distribution_template  # noqa: F401

@@ -1,9 +1,19 @@
+version: '2'
 name: hf-endpoint
 distribution_spec:
-  description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
+  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::hf::endpoint
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::hf::endpoint
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

llama_stack/templates/hf-endpoint/hf_endpoint.py (Normal file)
@@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::hf::endpoint"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="hf-endpoint",
        provider_type="remote::hf::endpoint",
        config=InferenceEndpointImplConfig.sample_run_config(),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="hf-endpoint",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="hf-endpoint-safety",
    )

    return DistributionTemplate(
        name="hf-endpoint",
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
        template_path=None,
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="hf-endpoint-safety",
                            provider_type="remote::hf::endpoint",
                            config=InferenceEndpointImplConfig.sample_run_config(
                                endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
                            ),
                        ),
                    ]
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "HF_API_TOKEN": (
                "hf_...",
                "Hugging Face API token",
            ),
            "INFERENCE_ENDPOINT_NAME": (
                "",
                "HF Inference endpoint name for the main inference model",
            ),
            "SAFETY_INFERENCE_ENDPOINT_NAME": (
                "",
                "HF Inference endpoint for the safety model",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model served by the HF Inference Endpoint",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Safety model served by the HF Inference Endpoint",
            ),
        },
    )

llama_stack/templates/hf-endpoint/run-with-safety.yaml (Normal file)
@@ -0,0 +1,68 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-endpoint
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  - provider_id: hf-endpoint-safety
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-endpoint
  provider_model_id: null
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: hf-endpoint-safety
  provider_model_id: null
shields:
- params: null
  shield_id: ${env.SAFETY_MODEL}
  provider_id: null
  provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

llama_stack/templates/hf-endpoint/run.yaml (Normal file)
@@ -0,0 +1,55 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-endpoint
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-endpoint
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

llama_stack/templates/hf-serverless/__init__.py (Normal file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .hf_serverless import get_distribution_template  # noqa: F401

@@ -1,9 +1,19 @@
+version: '2'
 name: hf-serverless
 distribution_spec:
-  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::hf::serverless
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::hf::serverless
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

llama_stack/templates/hf-serverless/hf_serverless.py (Normal file)
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::hf::serverless"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="hf-serverless",
        provider_type="remote::hf::serverless",
        config=InferenceAPIImplConfig.sample_run_config(),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="hf-serverless",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="hf-serverless-safety",
    )

    return DistributionTemplate(
        name="hf-serverless",
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
        template_path=None,
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="hf-serverless-safety",
                            provider_type="remote::hf::serverless",
                            config=InferenceAPIImplConfig.sample_run_config(
                                repo="${env.SAFETY_MODEL}",
                            ),
                        ),
                    ]
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "HF_API_TOKEN": (
                "hf_...",
                "Hugging Face API token",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model to be served by the HF Serverless endpoint",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Safety model to be served by the HF Serverless endpoint",
            ),
        },
    )

llama_stack/templates/hf-serverless/run-with-safety.yaml (Normal file)
@@ -0,0 +1,68 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-serverless
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.INFERENCE_MODEL}
      api_token: ${env.HF_API_TOKEN}
  - provider_id: hf-serverless-safety
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.SAFETY_MODEL}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-serverless
  provider_model_id: null
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: hf-serverless-safety
  provider_model_id: null
shields:
- params: null
  shield_id: ${env.SAFETY_MODEL}
  provider_id: null
  provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

llama_stack/templates/hf-serverless/run.yaml (Normal file)
@@ -0,0 +1,55 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-serverless
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.INFERENCE_MODEL}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-serverless
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

@@ -1,13 +0,0 @@
name: meta-reference-gpu
distribution_spec:
  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
  description: Use code from `llama_stack` itself to serve all llama stack APIs
  providers:
    inference: inline::meta-reference
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety: inline::llama-guard
    agents: inline::meta-reference
    telemetry: inline::meta-reference

@@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

@@ -66,8 +62,8 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
-llama stack build --template meta-reference-gpu --image-type conda
-llama stack run ./run.yaml \
+llama stack build --template {{ name }} --image-type conda
+llama stack run distributions/{{ name }}/run.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -75,7 +71,7 @@ llama stack run ./run.yaml \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
-llama stack run ./run-with-safety.yaml \
+llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .meta_reference import get_distribution_template  # noqa: F401

@@ -0,0 +1,54 @@
# Meta Reference Quantized Distribution

The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.

| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |

The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.

### Step 0. Prerequisite - Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models.

```
$ ls ~/.llama/checkpoints
Llama3.2-3B-Instruct:int4-qlora-eo8
```

### Step 1. Start the Distribution
#### (Option 1) Start with Docker
```
$ cd distributions/meta-reference-quantized-gpu && docker compose up
```

> [!NOTE]
> This assumes you have access to GPU to start a local server with access to your GPU.

> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.

This will download and start running a pre-built docker container. Alternatively, you may use the following commands:

```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml
```

#### (Option 2) Start with Conda

1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)

2. Build the `meta-reference-quantized-gpu` distribution

```
$ llama stack build --template meta-reference-quantized-gpu --image-type conda
```

3. Start running distribution
```
$ cd distributions/meta-reference-quantized-gpu
$ llama stack run ./run.yaml
```

@@ -0,0 +1,100 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceInferenceConfig,
)
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["inline::meta-reference"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="meta-reference-inference",
        provider_type="inline::meta-reference",
        config=MetaReferenceInferenceConfig.sample_run_config(
            model="${env.INFERENCE_MODEL}",
            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
        ),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="meta-reference-inference",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="meta-reference-safety",
    )

    return DistributionTemplate(
        name="meta-reference-gpu",
        distro_type="self_hosted",
        description="Use Meta Reference for running LLM inference",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="meta-reference-safety",
                            provider_type="inline::meta-reference",
                            config=MetaReferenceInferenceConfig.sample_run_config(
                                model="${env.SAFETY_MODEL}",
                                checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}",
                            ),
                        ),
                    ],
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model loaded into the Meta Reference server",
            ),
            "INFERENCE_CHECKPOINT_DIR": (
                "null",
                "Directory containing the Meta Reference model checkpoint",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Name of the safety (Llama-Guard) model to use",
            ),
            "SAFETY_CHECKPOINT_DIR": (
                "null",
                "Directory containing the Llama-Guard model checkpoint",
            ),
        },
    )

@@ -55,9 +55,7 @@ docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://host.docker.internal:11434

@@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 ```bash
 export LLAMA_STACK_PORT=5001
 
-llama stack build --template ollama --image-type conda
+llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \

@@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
 
 class RunConfigSettings(BaseModel):
     provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    default_models: List[ModelInput]
+    default_models: Optional[List[ModelInput]] = None
     default_shields: Optional[List[ShieldInput]] = None
 
     def run_config(

@@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
             __distro_dir__=f"distributions/{name}",
             db_name="registry.db",
         ),
-        models=self.default_models,
+        models=self.default_models or [],
         shields=self.default_shields or [],
     )

@@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):
 
     providers: Dict[str, List[str]]
     run_configs: Dict[str, RunConfigSettings]
-    template_path: Path
+    template_path: Optional[Path] = None
 
     # Optional configuration
     run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None

@@ -159,6 +159,7 @@
         with open(yaml_output_dir / yaml_pth, "w") as f:
             yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)
 
-        docs = self.generate_markdown_docs()
-        with open(doc_output_dir / f"{self.name}.md", "w") as f:
-            f.write(docs if docs.endswith("\n") else docs + "\n")
+        if self.template_path:
+            docs = self.generate_markdown_docs()
+            with open(doc_output_dir / f"{self.name}.md", "w") as f:
+                f.write(docs if docs.endswith("\n") else docs + "\n")

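Taken together, these changes make both the doc template and the default model list optional, which is what lets templates such as vllm-gpu and hf-serverless ship with `template_path=None`, and bedrock use a bare `RunConfigSettings()`. The pattern in isolation, as a small self-contained pydantic sketch independent of the rest of template.py:

```python
from typing import List, Optional

from pydantic import BaseModel

class ModelInput(BaseModel):
    model_id: str

class RunConfigSettings(BaseModel):
    default_models: Optional[List[ModelInput]] = None

settings = RunConfigSettings()          # legal now that the field is Optional
models = settings.default_models or []  # the `or []` guard mirrors run_config()
assert models == []
```
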
@@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT

@@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 ```bash
 llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
-llama stack run ./run-with-safety.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
-  --env SAFETY_MODEL=$SAFETY_MODEL
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```

@@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```

@@ -53,8 +51,8 @@ docker run \
 ### Via Conda
 
 ```bash
-llama stack build --template together --image-type conda
+llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```

llama_stack/templates/vllm-gpu/__init__.py (Normal file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .vllm import get_distribution_template  # noqa: F401

llama_stack/templates/vllm-gpu/build.yaml (Normal file)
@@ -0,0 +1,19 @@
version: '2'
name: vllm-gpu
distribution_spec:
  description: Use a built-in vLLM engine for running LLM inference
  docker_image: null
  providers:
    inference:
    - inline::vllm
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

llama_stack/templates/vllm-gpu/run.yaml (Normal file)
@@ -0,0 +1,58 @@
version: '2'
image_name: vllm-gpu
docker_image: null
conda_env: vllm-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: vllm
    provider_type: inline::vllm
    config:
      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
      tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
      max_tokens: ${env.MAX_TOKENS:4096}
      enforce_eager: ${env.ENFORCE_EAGER:False}
      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

llama_stack/templates/vllm-gpu/vllm.py (Normal file)
@@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.vllm import VLLMConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["inline::vllm"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="vllm",
        provider_type="inline::vllm",
        config=VLLMConfig.sample_run_config(),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="vllm",
    )

    return DistributionTemplate(
        name="vllm-gpu",
        distro_type="self_hosted",
        description="Use a built-in vLLM engine for running LLM inference",
        docker_image=None,
        template_path=None,
        providers=providers,
        default_models=[inference_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model loaded into the vLLM engine",
            ),
            "TENSOR_PARALLEL_SIZE": (
                "1",
                "Number of tensor parallel replicas (number of GPUs to use).",
            ),
            "MAX_TOKENS": (
                "4096",
                "Maximum number of tokens to generate.",
            ),
            "ENFORCE_EAGER": (
                "False",
                "Whether to use eager mode for inference (otherwise cuda graphs are used).",
            ),
            "GPU_MEMORY_UTILIZATION": (
                "0.7",
                "GPU memory utilization for the vLLM engine.",
            ),
        },
    )