From ae49a4cb9792e2f017a9f6cc34c065cde185df1d Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 20 Nov 2024 10:27:29 -0800 Subject: [PATCH 01/11] Reorganizing Zero to Hero Folder structure (#447) Putting Zero to Hero Guide to root for increased visibility --- .../00_Inference101.ipynb | 8 -------- .../01_Local_Cloud_Inference101.ipynb | 8 -------- .../02_Prompt_Engineering101.ipynb | 8 -------- .../03_Image_Chat101.ipynb | 8 -------- .../04_Tool_Calling101.ipynb | 7 ------- .../05_Memory101.ipynb | 7 ------- .../06_Safety101.ipynb | 9 +-------- .../07_Agents101.ipynb | 7 ------- ..._Calling101_Using_Together's_Llama_Stack_Server.ipynb | 0 .../quickstart.md | 0 10 files changed, 1 insertion(+), 61 deletions(-) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/00_Inference101.ipynb (97%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/01_Local_Cloud_Inference101.ipynb (95%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/02_Prompt_Engineering101.ipynb (96%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/03_Image_Chat101.ipynb (96%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/04_Tool_Calling101.ipynb (98%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/05_Memory101.ipynb (99%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/06_Safety101.ipynb (95%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/07_Agents101.ipynb (99%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb (100%) rename {docs/zero_to_hero_guide => zero_to_hero_guide}/quickstart.md (100%) diff --git a/docs/zero_to_hero_guide/00_Inference101.ipynb b/zero_to_hero_guide/00_Inference101.ipynb similarity index 97% rename from docs/zero_to_hero_guide/00_Inference101.ipynb rename to zero_to_hero_guide/00_Inference101.ipynb index 8bc2de2db..4da0d0df1 100644 --- a/docs/zero_to_hero_guide/00_Inference101.ipynb +++ b/zero_to_hero_guide/00_Inference101.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "5af4f44e", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "id": "c1e7571c", diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb similarity index 95% rename from docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb rename to zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb index 030bc6171..7225f0741 100644 --- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb +++ b/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "785bd3ff", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "id": "a0ed972d", diff --git a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb b/zero_to_hero_guide/02_Prompt_Engineering101.ipynb similarity index 96% rename from docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb rename to zero_to_hero_guide/02_Prompt_Engineering101.ipynb index bbd315ccc..4ff28e470 100644 --- a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb +++ b/zero_to_hero_guide/02_Prompt_Engineering101.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "d2bf5275", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "id": "cd96f85a", diff --git a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb b/zero_to_hero_guide/03_Image_Chat101.ipynb similarity index 96% rename from docs/zero_to_hero_guide/03_Image_Chat101.ipynb rename to 
zero_to_hero_guide/03_Image_Chat101.ipynb index 3f3cc8d2a..f90605a5a 100644 --- a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb +++ b/zero_to_hero_guide/03_Image_Chat101.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "6323a6be", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "id": "923343b0-d4bd-4361-b8d4-dd29f86a0fbd", diff --git a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb b/zero_to_hero_guide/04_Tool_Calling101.ipynb similarity index 98% rename from docs/zero_to_hero_guide/04_Tool_Calling101.ipynb rename to zero_to_hero_guide/04_Tool_Calling101.ipynb index 7aad7bab6..43378170f 100644 --- a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb +++ b/zero_to_hero_guide/04_Tool_Calling101.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/zero_to_hero_guide/05_Memory101.ipynb b/zero_to_hero_guide/05_Memory101.ipynb similarity index 99% rename from docs/zero_to_hero_guide/05_Memory101.ipynb rename to zero_to_hero_guide/05_Memory101.ipynb index c7c51c7fd..92e287bef 100644 --- a/docs/zero_to_hero_guide/05_Memory101.ipynb +++ b/zero_to_hero_guide/05_Memory101.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/zero_to_hero_guide/06_Safety101.ipynb b/zero_to_hero_guide/06_Safety101.ipynb similarity index 95% rename from docs/zero_to_hero_guide/06_Safety101.ipynb rename to zero_to_hero_guide/06_Safety101.ipynb index f5352627e..73ddab4a2 100644 --- a/docs/zero_to_hero_guide/06_Safety101.ipynb +++ b/zero_to_hero_guide/06_Safety101.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -18,7 +11,7 @@ "As outlined in our [Responsible Use Guide](https://www.llama.com/docs/how-to-guides/responsible-use-guide-resources/), LLM apps should deploy appropriate system level safeguards to mitigate safety and security risks of LLM system, similar to the following diagram:\n", "\n", "
\n", - "\"Figure\n", + "\"Figure\n", "
\n", "To that goal, Llama Stack uses **Prompt Guard** and **Llama Guard 3** to secure our system. Here are the quick introduction about them.\n" ] diff --git a/docs/zero_to_hero_guide/07_Agents101.ipynb b/zero_to_hero_guide/07_Agents101.ipynb similarity index 99% rename from docs/zero_to_hero_guide/07_Agents101.ipynb rename to zero_to_hero_guide/07_Agents101.ipynb index 40a797602..11f54fe68 100644 --- a/docs/zero_to_hero_guide/07_Agents101.ipynb +++ b/zero_to_hero_guide/07_Agents101.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb b/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb similarity index 100% rename from docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb rename to zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb diff --git a/docs/zero_to_hero_guide/quickstart.md b/zero_to_hero_guide/quickstart.md similarity index 100% rename from docs/zero_to_hero_guide/quickstart.md rename to zero_to_hero_guide/quickstart.md From 91e7efbc91c729d74c5cf9b3947d3e8acc1fbb71 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 20 Nov 2024 10:30:23 -0800 Subject: [PATCH 02/11] fall to back to read from chroma/pgvector when not in cache (#489) # What does this PR do? The chroma provider maintains a cache but does not sync up with chroma on a cold start. this change adds a fallback to read from chroma on a cache miss. ## Test Plan ```bash #start stack llama stack run /Users/dineshyv/.llama/distributions/llamastack-together/together-run.yaml # Add documents PYTHONPATH=. python -m examples.agents.rag_with_memory_bank localhost 5000 No available shields. Disable safety. Using model: Llama3.1-8B-Instruct Created session_id=b951b14f-a9d2-43a3-8b80-d80114d58322 for Agent(0687a251-6906-4081-8d4c-f52e19db9dd7) memory_retrieval> Retrieved context from banks: ['test_bank']. ==== Here are the retrieved documents for relevant context: === START-RETRIEVED-CONTEXT === id:num-1; content:_ the template from Llama2 to better support multiturn conversations. The same text in the Lla... > inference> Based on the retrieved documentation, the top 5 topics that were explained are: ............... # Kill stack # Bootup stack llama stack run /Users/dineshyv/.llama/distributions/llamastack-together/together-run.yaml # Run a RAG app with just the agent flow. it discovers the previously added documents No available shields. Disable safety. Using model: Llama3.1-8B-Instruct Created session_id=7a30c1a7-c87e-4787-936c-d0306589fe5d for Agent(b30420f3-c928-498a-887b-d084f0f3806c) memory_retrieval> Retrieved context from banks: ['test_bank']. ==== Here are the retrieved documents for relevant context: === START-RETRIEVED-CONTEXT === id:num-1; content:_ the template from Llama2 to better support multiturn conversations. The same text in the Lla... > inference> Based on the provided documentation, the top 5 topics that were explained are: ..... 
``` --- .../providers/remote/memory/chroma/chroma.py | 22 ++++++++++++++----- .../remote/memory/pgvector/pgvector.py | 22 ++++++++++++------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/llama_stack/providers/remote/memory/chroma/chroma.py b/llama_stack/providers/remote/memory/chroma/chroma.py index ac00fc749..3ccd6a534 100644 --- a/llama_stack/providers/remote/memory/chroma/chroma.py +++ b/llama_stack/providers/remote/memory/chroma/chroma.py @@ -147,9 +147,7 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate): documents: List[MemoryBankDocument], ttl_seconds: Optional[int] = None, ) -> None: - index = self.cache.get(bank_id, None) - if not index: - raise ValueError(f"Bank {bank_id} not found") + index = await self._get_and_cache_bank_index(bank_id) await index.insert_documents(documents) @@ -159,8 +157,20 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate): query: InterleavedTextMedia, params: Optional[Dict[str, Any]] = None, ) -> QueryDocumentsResponse: - index = self.cache.get(bank_id, None) - if not index: - raise ValueError(f"Bank {bank_id} not found") + index = await self._get_and_cache_bank_index(bank_id) return await index.query_documents(query, params) + + async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex: + if bank_id in self.cache: + return self.cache[bank_id] + + bank = await self.memory_bank_store.get_memory_bank(bank_id) + if not bank: + raise ValueError(f"Bank {bank_id} not found in Llama Stack") + collection = await self.client.get_collection(bank_id) + if not collection: + raise ValueError(f"Bank {bank_id} not found in Chroma") + index = BankWithIndex(bank=bank, index=ChromaIndex(self.client, collection)) + self.cache[bank_id] = index + return index diff --git a/llama_stack/providers/remote/memory/pgvector/pgvector.py b/llama_stack/providers/remote/memory/pgvector/pgvector.py index 44c2a8fe1..bd27509d6 100644 --- a/llama_stack/providers/remote/memory/pgvector/pgvector.py +++ b/llama_stack/providers/remote/memory/pgvector/pgvector.py @@ -201,10 +201,7 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): documents: List[MemoryBankDocument], ttl_seconds: Optional[int] = None, ) -> None: - index = self.cache.get(bank_id, None) - if not index: - raise ValueError(f"Bank {bank_id} not found") - + index = await self._get_and_cache_bank_index(bank_id) await index.insert_documents(documents) async def query_documents( @@ -213,8 +210,17 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): query: InterleavedTextMedia, params: Optional[Dict[str, Any]] = None, ) -> QueryDocumentsResponse: - index = self.cache.get(bank_id, None) - if not index: - raise ValueError(f"Bank {bank_id} not found") - + index = await self._get_and_cache_bank_index(bank_id) return await index.query_documents(query, params) + + async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex: + if bank_id in self.cache: + return self.cache[bank_id] + + bank = await self.memory_bank_store.get_memory_bank(bank_id) + index = BankWithIndex( + bank=bank, + index=PGVectorIndex(bank, ALL_MINILM_L6_V2_DIMENSION, self.cursor), + ) + self.cache[bank_id] = index + return index From 1d8d0593afb3fe54b4f1c0a1f30117910d4e88be Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 20 Nov 2024 11:05:50 -0800 Subject: [PATCH 03/11] register with provider even if present in stack (#491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
Remove a check which skips provider registration if a resource is already in stack registry. Since we do not reconcile state with provider, register should always call into provider's register endpoint. ## Test Plan ``` # stack run ╰─❯ llama stack run /Users/dineshyv/.llama/distributions/llamastack-together/together-run.yaml #register memory bank ❯ llama-stack-client memory_banks register your_memory_bank_name --type vector --provider-id inline::faiss-0 Memory Bank Configuration: { │ 'memory_bank_type': 'vector', │ 'chunk_size_in_tokens': 512, │ 'embedding_model': 'all-MiniLM-L6-v2', │ 'overlap_size_in_tokens': 64 } #register again ❯ llama-stack-client memory_banks register your_memory_bank_name --type vector --provider-id inline::faiss-0 Memory Bank Configuration: { │ 'memory_bank_type': 'vector', │ 'chunk_size_in_tokens': 512, │ 'embedding_model': 'all-MiniLM-L6-v2', │ 'overlap_size_in_tokens': 64 } ``` --- llama_stack/distribution/routers/routing_tables.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 76078e652..4df693b26 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -170,13 +170,6 @@ class CommonRoutingTableImpl(RoutingTable): # Get existing objects from registry existing_obj = await self.dist_registry.get(obj.type, obj.identifier) - # Check for existing registration - if existing_obj and existing_obj.provider_id == obj.provider_id: - print( - f"`{obj.identifier}` already registered with `{existing_obj.provider_id}`" - ) - return existing_obj - # if provider_id is not specified, pick an arbitrary one from existing entries if not obj.provider_id and len(self.impls_by_provider_id) > 0: obj.provider_id = list(self.impls_by_provider_id.keys())[0] From 681322731b0ae863f4b486b5daf746914a25a361 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 13:11:40 -0800 Subject: [PATCH 04/11] Make run yaml optional so dockers can start with just --env (#492) When running with dockers, the idea is that users be able to work purely with the `llama stack` CLI. They should not need to know about the existence of any YAMLs unless they need to. This PR enables it. The docker command now doesn't need to volume mount a yaml and can simply be: ```bash docker run -v ~/.llama/:/root/.llama \ --env A=a --env B=b ``` ## Test Plan Check with conda first (no regressions): ```bash LLAMA_STACK_DIR=. llama stack build --template ollama llama stack run ollama --port 5001 # server starts up correctly ``` Check with docker ```bash # build the docker LLAMA_STACK_DIR=. llama stack build --template ollama --image-type docker export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" docker run -it -p 5001:5001 \ -v ~/.llama:/root/.llama \ -v $PWD:/app/llama-stack-source \ localhost/distribution-ollama:dev \ --port 5001 \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.docker.internal:11434 ``` Note that volume mounting to `/app/llama-stack-source` is only needed because we built the docker with uncommitted source code. 
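As a rough illustration of the new behavior, here is a minimal, hypothetical sketch of the lookup order `llama stack run <name>` now follows. The real code is in `llama_stack/cli/stack/run.py` in the diff below; `resolve_run_config` and its parameters are illustrative stand-ins for `REPO_ROOT`, `BUILDS_BASE_DIR`, and `DISTRIBS_BASE_DIR`, and the final `~/.llama` candidate layout is an assumption, not the exact path used by the CLI.

```python
# Hypothetical sketch (not the patched code) of the run-config fallback lookup.
from pathlib import Path
from typing import List


def resolve_run_config(name: str, repo_root: Path, builds_dir: Path, distribs_dir: Path) -> Path:
    config_file = Path(name)
    if config_file.exists() or name.endswith(".yaml"):
        # explicit paths (and anything ending in .yaml) are used as-is
        return config_file

    candidates: List[Path] = [
        repo_root / "llama_stack" / "templates" / name / "run.yaml",  # bundled template
        builds_dir / "conda" / f"{name}-run.yaml",                    # conda build output
        builds_dir / "docker" / f"{name}-run.yaml",                   # docker build output
        distribs_dir / name / f"{name}-run.yaml",                     # assumed ~/.llama layout
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    raise FileNotFoundError(f"Could not resolve a run.yaml for '{name}'")
```

With a lookup like this in place, `llama stack run ollama` resolves the bundled template's run.yaml automatically, which is what lets the docker command above start with just `--port` and `--env` flags and no volume-mounted YAML.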
--- llama_stack/cli/stack/run.py | 20 ++++++++++---- llama_stack/distribution/build_container.sh | 2 +- llama_stack/distribution/server/server.py | 30 +++++++++++++++++++-- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index c3ea174da..fb4e76d7a 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -5,9 +5,12 @@ # the root directory of this source tree. import argparse +from pathlib import Path from llama_stack.cli.subcommand import Subcommand +REPO_ROOT = Path(__file__).parent.parent.parent.parent + class StackRun(Subcommand): def __init__(self, subparsers: argparse._SubParsersAction): @@ -48,8 +51,6 @@ class StackRun(Subcommand): ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: - from pathlib import Path - import pkg_resources import yaml @@ -66,19 +67,27 @@ class StackRun(Subcommand): return config_file = Path(args.config) - if not config_file.exists() and not args.config.endswith(".yaml"): + has_yaml_suffix = args.config.endswith(".yaml") + + if not config_file.exists() and not has_yaml_suffix: + # check if this is a template + config_file = ( + Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml" + ) + + if not config_file.exists() and not has_yaml_suffix: # check if it's a build config saved to conda dir config_file = Path( BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml" ) - if not config_file.exists() and not args.config.endswith(".yaml"): + if not config_file.exists() and not has_yaml_suffix: # check if it's a build config saved to docker dir config_file = Path( BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml" ) - if not config_file.exists() and not args.config.endswith(".yaml"): + if not config_file.exists() and not has_yaml_suffix: # check if it's a build config saved to ~/.llama dir config_file = Path( DISTRIBS_BASE_DIR @@ -92,6 +101,7 @@ class StackRun(Subcommand): ) return + print(f"Using config file: {config_file}") config_dict = yaml.safe_load(config_file.read_text()) config = parse_and_maybe_upgrade_config(config_dict) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 2730ae174..a9aee8f14 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -122,7 +122,7 @@ add_to_docker < Date: Wed, 20 Nov 2024 13:55:43 -0800 Subject: [PATCH 05/11] make sure codegen doesn't cause spurious diffs for no reason --- llama_stack/scripts/distro_codegen.py | 2 +- llama_stack/templates/template.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index b82319bd5..84bf9af2a 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -103,7 +103,7 @@ def generate_dependencies_file(): deps_file = REPO_ROOT / "distributions" / "dependencies.json" with open(deps_file, "w") as f: - json.dump(distribution_deps, f, indent=2) + f.write(json.dumps(distribution_deps, indent=2) + "\n") def main(): diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index fd37016f8..fe0278718 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -161,4 +161,4 @@ class DistributionTemplate(BaseModel): docs = self.generate_markdown_docs() with open(doc_output_dir / f"{self.name}.md", "w") as f: - f.write(docs) + f.write(docs if 
docs.endswith("\n") else docs + "\n") From 068ac00a3bcb18337a017646234b2a758d1c72b6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 15:44:49 -0800 Subject: [PATCH 06/11] Don't depend on templates.py when print llama stack build messages (#496) --- llama_stack/cli/stack/build.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index e9760c9cb..ce1ed2747 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -8,7 +8,6 @@ import argparse from llama_stack.cli.subcommand import Subcommand from llama_stack.distribution.datatypes import * # noqa: F403 -import importlib import os import shutil from functools import lru_cache @@ -258,6 +257,7 @@ class StackBuild(Subcommand): ) -> None: import json import os + import re import yaml from termcolor import cprint @@ -286,17 +286,19 @@ class StackBuild(Subcommand): os.makedirs(build_dir, exist_ok=True) run_config_file = build_dir / f"{build_config.name}-run.yaml" shutil.copy(template_path, run_config_file) - module_name = f"llama_stack.templates.{template_name}" - module = importlib.import_module(module_name) - distribution_template = module.get_distribution_template() + + with open(template_path, "r") as f: + yaml_content = f.read() + + # Find all ${env.VARIABLE} patterns + env_vars = set(re.findall(r"\${env\.([A-Za-z0-9_]+)}", yaml_content)) cprint("Build Successful! Next steps: ", color="green") - env_vars = ", ".join(distribution_template.run_config_env_vars.keys()) cprint( - f" 1. Set the environment variables: {env_vars}", + f" 1. Set the environment variables: {list(env_vars)}", color="green", ) cprint( - f" 2. `llama stack run {run_config_file}`", + f" 2. Run: `llama stack run {template_name}`", color="green", ) else: From b3f9e8b2f2b74f0796c9f6d0ab08f123f4c9924d Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 20 Nov 2024 15:54:47 -0800 Subject: [PATCH 07/11] Restructure docs (#494) Rendered docs at: https://llama-stack.readthedocs.io/en/doc-simplify/ --- docs/.gitignore | 1 + docs/source/distributions/index.md | 139 +++++++ .../distributions/ondevice_distro/index.md | 0 .../distributions/ondevice_distro/ios_sdk.md | 0 .../remote_hosted_distro/index.md | 7 + .../self_hosted_distro/bedrock.md | 6 + .../self_hosted_distro/dell-tgi.md | 7 + .../self_hosted_distro/fireworks.md | 7 + .../distributions/self_hosted_distro/index.md | 28 +- .../self_hosted_distro/meta-reference-gpu.md | 7 + .../meta-reference-quantized-gpu.md | 7 + .../self_hosted_distro/ollama.md | 7 + .../self_hosted_distro/remote-vllm.md | 7 + .../distributions/self_hosted_distro/tgi.md | 7 + .../self_hosted_distro/together.md | 9 +- docs/source/getting_started/index.md | 370 +++++++++--------- docs/source/index.md | 9 +- .../download_models.md | 0 .../index.md | 6 +- .../llama_stack_client_cli_reference/index.md | 162 ++++++++ 20 files changed, 586 insertions(+), 200 deletions(-) create mode 100644 docs/.gitignore create mode 100644 docs/source/distributions/index.md rename docs/source/{getting_started => }/distributions/ondevice_distro/index.md (100%) rename docs/source/{getting_started => }/distributions/ondevice_distro/ios_sdk.md (100%) rename docs/source/{getting_started => }/distributions/remote_hosted_distro/index.md (98%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/bedrock.md (98%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/dell-tgi.md (98%) rename 
docs/source/{getting_started => }/distributions/self_hosted_distro/fireworks.md (97%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/index.md (63%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/meta-reference-gpu.md (98%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/meta-reference-quantized-gpu.md (97%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/ollama.md (99%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/remote-vllm.md (99%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/tgi.md (98%) rename docs/source/{getting_started => }/distributions/self_hosted_distro/together.md (96%) rename docs/source/{cli_reference => llama_cli_reference}/download_models.md (100%) rename docs/source/{cli_reference => llama_cli_reference}/index.md (98%) create mode 100644 docs/source/llama_stack_client_cli_reference/index.md diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000..85de9cf93 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +src diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md new file mode 100644 index 000000000..753555d5b --- /dev/null +++ b/docs/source/distributions/index.md @@ -0,0 +1,139 @@ +# Llama Stack Distributions + + +```{toctree} +:maxdepth: 2 +:hidden: + +self_hosted_distro/index +remote_hosted_distro/index +ondevice_distro/index +``` +## Introduction + +Llama Stack Distributions are pre-built Docker containers/Conda environments that assemble APIs and Providers to provide a consistent whole to the end application developer. +These distributions allow you to mix-and-match providers - some could be backed by local code and some could be remote. This flexibility enables you to choose the optimal setup for your use case, such as serving a small model locally while using a cloud provider for larger models, all while maintaining a consistent API interface for your application. + + +## Decide Your Build Type +There are two ways to start a Llama Stack: + +- **Docker**: we provide a number of pre-built Docker containers allowing you to get started instantly. If you are focused on application development, we recommend this option. +- **Conda**: the `llama` CLI provides a simple set of commands to build, configure and run a Llama Stack server containing the exact combination of providers you wish. We have provided various templates to make getting started easier. + +Both of these provide options to run model inference using our reference implementations, Ollama, TGI, vLLM or even remote providers like Fireworks, Together, Bedrock, etc. + +### Decide Your Inference Provider + +Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites. 
+ +- **Do you have access to a machine with powerful GPUs?** +If so, we suggest: + - [distribution-meta-reference-gpu](./self_hosted_distro/meta-reference-gpu.md) + - [distribution-tgi](./self_hosted_distro/tgi.md) + +- **Are you running on a "regular" desktop machine?** +If so, we suggest: + - [distribution-ollama](./self_hosted_distro/ollama.md) + +- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest: + - [distribution-together](./remote_hosted_distro/together.md) + - [distribution-fireworks](./remote_hosted_distro/fireworks.md) + +- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest: + - [iOS](./ondevice_distro/ios_sdk.md) + - [Android](https://github.com/meta-llama/llama-stack-client-kotlin) (coming soon) + +Please see our pages in detail for the types of distributions we offer: + +1. [Self-Hosted Distribution](./self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine. +2. [Remote-Hosted Distribution](./remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider. +3. [On-device Distribution](./ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. + +## Building Your Own Distribution + +### Prerequisites + +```bash +$ git clone git@github.com:meta-llama/llama-stack.git +``` + + +### Starting the Distribution + +::::{tab-set} + +:::{tab-item} meta-reference-gpu +##### System Requirements +Access to Single-Node GPU to start a local server. + +##### Downloading Models +Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../cli_reference/download_models.md) here to download the models. + +``` +$ ls ~/.llama/checkpoints +Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B +Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M +``` + +::: + +:::{tab-item} vLLM +##### System Requirements +Access to Single-Node GPU to start a vLLM server. +::: + +:::{tab-item} tgi +##### System Requirements +Access to Single-Node GPU to start a TGI server. +::: + +:::{tab-item} ollama +##### System Requirements +Access to Single-Node CPU/GPU able to run ollama. +::: + +:::{tab-item} together +##### System Requirements +Access to Single-Node CPU with Together hosted endpoint via API_KEY from [together.ai](https://api.together.xyz/signin). +::: + +:::{tab-item} fireworks +##### System Requirements +Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [fireworks.ai](https://fireworks.ai/). +::: + +:::: + + +::::{tab-set} +:::{tab-item} meta-reference-gpu +- [Start Meta Reference GPU Distribution](./self_hosted_distro/meta-reference-gpu.md) +::: + +:::{tab-item} vLLM +- [Start vLLM Distribution](./self_hosted_distro/remote-vllm.md) +::: + +:::{tab-item} tgi +- [Start TGI Distribution](./self_hosted_distro/tgi.md) +::: + +:::{tab-item} ollama +- [Start Ollama Distribution](./self_hosted_distro/ollama.md) +::: + +:::{tab-item} together +- [Start Together Distribution](./self_hosted_distro/together.md) +::: + +:::{tab-item} fireworks +- [Start Fireworks Distribution](./self_hosted_distro/fireworks.md) +::: + +:::: + +### Troubleshooting + +- If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue. +- Use `--port ` flag to use a different port number. 
For docker run, update the `-p :` flag. diff --git a/docs/source/getting_started/distributions/ondevice_distro/index.md b/docs/source/distributions/ondevice_distro/index.md similarity index 100% rename from docs/source/getting_started/distributions/ondevice_distro/index.md rename to docs/source/distributions/ondevice_distro/index.md diff --git a/docs/source/getting_started/distributions/ondevice_distro/ios_sdk.md b/docs/source/distributions/ondevice_distro/ios_sdk.md similarity index 100% rename from docs/source/getting_started/distributions/ondevice_distro/ios_sdk.md rename to docs/source/distributions/ondevice_distro/ios_sdk.md diff --git a/docs/source/getting_started/distributions/remote_hosted_distro/index.md b/docs/source/distributions/remote_hosted_distro/index.md similarity index 98% rename from docs/source/getting_started/distributions/remote_hosted_distro/index.md rename to docs/source/distributions/remote_hosted_distro/index.md index 76d5fdf27..308d29fa1 100644 --- a/docs/source/getting_started/distributions/remote_hosted_distro/index.md +++ b/docs/source/distributions/remote_hosted_distro/index.md @@ -1,5 +1,12 @@ # Remote-Hosted Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +remote +``` + Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to. | Distribution | Endpoint | Inference | Agents | Memory | Safety | Telemetry | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md similarity index 98% rename from docs/source/getting_started/distributions/self_hosted_distro/bedrock.md rename to docs/source/distributions/self_hosted_distro/bedrock.md index 28691d4e3..edef88390 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -1,4 +1,10 @@ # Bedrock Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` ### Connect to a Llama Stack Bedrock Endpoint - You may connect to Amazon Bedrock APIs for running LLM inference diff --git a/docs/source/getting_started/distributions/self_hosted_distro/dell-tgi.md b/docs/source/distributions/self_hosted_distro/dell-tgi.md similarity index 98% rename from docs/source/getting_started/distributions/self_hosted_distro/dell-tgi.md rename to docs/source/distributions/self_hosted_distro/dell-tgi.md index 90d6a87c9..c74cccfe2 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/dell-tgi.md +++ b/docs/source/distributions/self_hosted_distro/dell-tgi.md @@ -1,5 +1,12 @@ # Dell-TGI Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-tgi` distribution consists of the following provider configurations. diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md similarity index 97% rename from docs/source/getting_started/distributions/self_hosted_distro/fireworks.md rename to docs/source/distributions/self_hosted_distro/fireworks.md index cca1155e1..e30bb1480 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -1,5 +1,12 @@ # Fireworks Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-fireworks` distribution consists of the following provider configurations. 
| API | Provider(s) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/index.md b/docs/source/distributions/self_hosted_distro/index.md similarity index 63% rename from docs/source/getting_started/distributions/self_hosted_distro/index.md rename to docs/source/distributions/self_hosted_distro/index.md index 502b95cb4..fb775fb52 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/index.md +++ b/docs/source/distributions/self_hosted_distro/index.md @@ -1,20 +1,8 @@ # Self-Hosted Distribution -We offer deployable distributions where you can host your own Llama Stack server using local inference. - -| **Distribution** | **Llama Stack Docker** | Start This Distribution | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: | -| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) | meta-reference | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference | -| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | meta-reference-quantized | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference | -| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) | remote::ollama | meta-reference | remote::pgvector; remote::chromadb | meta-reference | meta-reference | -| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) | remote::tgi | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference | -| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference | -| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) | remote::fireworks | meta-reference | remote::weaviate | meta-reference | meta-reference | -| Bedrock | [llamastack/distribution-bedrock](https://hub.docker.com/repository/docker/llamastack/distribution-bedrock/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/bedrock.html) | remote::bedrock | meta-reference | 
remote::weaviate | meta-reference | meta-reference | - - ```{toctree} -:maxdepth: 1 +:maxdepth: 2 +:hidden: meta-reference-gpu meta-reference-quantized-gpu @@ -26,3 +14,15 @@ fireworks remote-vllm bedrock ``` + +We offer deployable distributions where you can host your own Llama Stack server using local inference. + +| **Distribution** | **Llama Stack Docker** | Start This Distribution | +|:----------------: |:------------------------------------------: |:-----------------------: | +| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) | +| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | +| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) | +| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) | +| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) | +| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) | +| Bedrock | [llamastack/distribution-bedrock](https://hub.docker.com/repository/docker/llamastack/distribution-bedrock/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/bedrock.html) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md similarity index 98% rename from docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md rename to docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 74a838d2f..65e1c8cf8 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -1,5 +1,12 @@ # Meta Reference Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations: | API | Provider(s) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md similarity index 97% rename from docs/source/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.md rename to docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 
afe1e3e20..7dcc642d5 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -1,5 +1,12 @@ # Meta Reference Quantized Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations. diff --git a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md similarity index 99% rename from docs/source/getting_started/distributions/self_hosted_distro/ollama.md rename to docs/source/distributions/self_hosted_distro/ollama.md index d1e9ea67a..fe65172f3 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -1,5 +1,12 @@ # Ollama Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-ollama` distribution consists of the following provider configurations. | API | Provider(s) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md similarity index 99% rename from docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md rename to docs/source/distributions/self_hosted_distro/remote-vllm.md index 748b98732..235cc1e0f 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -1,5 +1,12 @@ # Remote vLLM Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations: | API | Provider(s) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md similarity index 98% rename from docs/source/getting_started/distributions/self_hosted_distro/tgi.md rename to docs/source/distributions/self_hosted_distro/tgi.md index 63631f937..3209b9100 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -1,5 +1,12 @@ # TGI Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-tgi` distribution consists of the following provider configurations. | API | Provider(s) | diff --git a/docs/source/getting_started/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md similarity index 96% rename from docs/source/getting_started/distributions/self_hosted_distro/together.md rename to docs/source/distributions/self_hosted_distro/together.md index 5d79fcf0c..303c62dcb 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -1,4 +1,11 @@ -# Fireworks Distribution +# Together Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` The `llamastack/distribution-together` distribution consists of the following provider configurations. 
diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 5fc2c5ed8..df91bc493 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -1,194 +1,208 @@ -# Getting Started +# Getting Started with Llama Stack -```{toctree} -:maxdepth: 2 -:hidden: -distributions/self_hosted_distro/index -distributions/remote_hosted_distro/index -distributions/ondevice_distro/index +In this guide, we'll walk through using ollama as the inference provider and build a simple python application that uses the Llama Stack Client SDK + +Llama stack consists of a distribution server and an accompanying client SDK. The distribution server can be configured for different providers for inference, memory, agents, evals etc. This configuration is defined in a yaml file called `run.yaml`. + +Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites. We will use ollama as the inference provider as it is the easiest to get started with. + +### Step 1. Start the inference server +```bash +export LLAMA_STACK_PORT=5001 +export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" +# ollama names this model differently, and we must use the ollama name when loading the model +export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16" +ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m ``` -At the end of the guide, you will have learned how to: -- get a Llama Stack server up and running -- set up an agent (with tool-calling and vector stores) that works with the above server - -To see more example apps built using Llama Stack, see [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main). - -## Step 1. Starting Up Llama Stack Server - -### Decide Your Build Type -There are two ways to start a Llama Stack: - -- **Docker**: we provide a number of pre-built Docker containers allowing you to get started instantly. If you are focused on application development, we recommend this option. -- **Conda**: the `llama` CLI provides a simple set of commands to build, configure and run a Llama Stack server containing the exact combination of providers you wish. We have provided various templates to make getting started easier. - -Both of these provide options to run model inference using our reference implementations, Ollama, TGI, vLLM or even remote providers like Fireworks, Together, Bedrock, etc. - -### Decide Your Inference Provider - -Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites. 
- -- **Do you have access to a machine with powerful GPUs?** -If so, we suggest: - - [distribution-meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) - - [distribution-tgi](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/tgi.html) - -- **Are you running on a "regular" desktop machine?** -If so, we suggest: - - [distribution-ollama](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) - -- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest: - - [distribution-together](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html) - - [distribution-fireworks](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html) - -- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest: - - [iOS](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/ondevice_distro/ios_sdk.html) - - [Android](https://github.com/meta-llama/llama-stack-client-kotlin) (coming soon) - -Please see our pages in detail for the types of distributions we offer: - -1. [Self-Hosted Distribution](./distributions/self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine. -2. [Remote-Hosted Distribution](./distributions/remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider. -3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. - - -### Table of Contents - -Once you have decided on the inference provider and distribution to use, use the following guides to get started. - -##### 1.0 Prerequisite - -``` -$ git clone git@github.com:meta-llama/llama-stack.git -``` - -::::{tab-set} - -:::{tab-item} meta-reference-gpu -##### System Requirements -Access to Single-Node GPU to start a local server. - -##### Downloading Models -Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. - -``` -$ ls ~/.llama/checkpoints -Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B -Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M -``` - -::: - -:::{tab-item} vLLM -##### System Requirements -Access to Single-Node GPU to start a vLLM server. -::: - -:::{tab-item} tgi -##### System Requirements -Access to Single-Node GPU to start a TGI server. -::: - -:::{tab-item} ollama -##### System Requirements -Access to Single-Node CPU/GPU able to run ollama. -::: - -:::{tab-item} together -##### System Requirements -Access to Single-Node CPU with Together hosted endpoint via API_KEY from [together.ai](https://api.together.xyz/signin). -::: - -:::{tab-item} fireworks -##### System Requirements -Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [fireworks.ai](https://fireworks.ai/). -::: - -:::: - -##### 1.1. 
Start the distribution - -::::{tab-set} -:::{tab-item} meta-reference-gpu -- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) -::: - -:::{tab-item} vLLM -- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) -::: - -:::{tab-item} tgi -- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) -::: - -:::{tab-item} ollama -- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) -::: - -:::{tab-item} together -- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) -::: - -:::{tab-item} fireworks -- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) -::: - -:::: - -##### Troubleshooting -- If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue. -- Use `--port ` flag to use a different port number. For docker run, update the `-p :` flag. - - -## Step 2. Run Llama Stack App - -### Chat Completion Test -Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API: +### Step 2. Start the Llama Stack server ```bash -$ curl http://localhost:5000/alpha/inference/chat-completion \ --H "Content-Type: application/json" \ --d '{ - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "messages": [ +export LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + llamastack/distribution-ollama \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 + +``` + +### Step 3. Use the Llama Stack client SDK +```bash +pip install llama-stack-client +``` + +We will use the `llama-stack-client` CLI to check the connectivity to the server. This should be installed in your environment if you installed the SDK. +```bash +llama-stack-client --endpoint http://localhost:5001 models list +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ meta-llama/Llama-3.2-3B-Instruct │ ollama │ llama3.2:3b-instruct-fp16 │ {} │ +└──────────────────────────────────┴─────────────┴───────────────────────────┴──────────┘ +``` + +Chat completion using the CLI +```bash +llama-stack-client --endpoint http://localhost:5001 inference chat_completion --message "hello, what model are you?" 
+``` + +Simple python example using the client SDK +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient(base_url="http://localhost:5001") + +# List available models +models = client.models.list() +print(models) + +# Simple chat completion +response = client.inference.chat_completion( + model_id="meta-llama/Llama-3.2-3B-Instruct", + messages=[ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Write me a 2 sentence poem about the moon"} - ], - "sampling_params": {"temperature": 0.7, "seed": 42, "max_tokens": 512} -}' - -Output: -{'completion_message': {'role': 'assistant', - 'content': 'The moon glows softly in the midnight sky, \nA beacon of wonder, as it catches the eye.', - 'stop_reason': 'out_of_tokens', - 'tool_calls': []}, - 'logprobs': null} - + {"role": "user", "content": "Write a haiku about coding"} + ] +) +print(response.completion_message.content) ``` -### Run Agent App +### Step 4. Your first RAG agent +```python +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. -To run an agent app, check out examples demo scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. To run a simple agent app: +import asyncio -```bash -$ git clone git@github.com:meta-llama/llama-stack-apps.git -$ cd llama-stack-apps -$ pip install -r requirements.txt +import fire -$ python -m examples.agents.client +from llama_stack_client import LlamaStackClient +from llama_stack_client.lib.agents.agent import Agent +from llama_stack_client.lib.agents.event_logger import EventLogger +from llama_stack_client.types import Attachment +from llama_stack_client.types.agent_create_params import AgentConfig + + +async def run_main(host: str, port: int, disable_safety: bool = False): + urls = [ + "memory_optimizations.rst", + "chat.rst", + "llama3.rst", + "datasets.rst", + "qat_finetune.rst", + "lora_finetune.rst", + ] + + attachments = [ + Attachment( + content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}", + mime_type="text/plain", + ) + for i, url in enumerate(urls) + ] + + client = LlamaStackClient( + base_url=f"http://{host}:{port}", + ) + + available_shields = [shield.identifier for shield in client.shields.list()] + if not available_shields: + print("No available shields. 
Disable safety.") + else: + print(f"Available shields found: {available_shields}") + available_models = [model.identifier for model in client.models.list()] + if not available_models: + raise ValueError("No available models") + else: + selected_model = available_models[0] + print(f"Using model: {selected_model}") + + agent_config = AgentConfig( + model=selected_model, + instructions="You are a helpful assistant", + sampling_params={ + "strategy": "greedy", + "temperature": 1.0, + "top_p": 0.9, + }, + tools=[ + { + "type": "memory", + "memory_bank_configs": [], + "query_generator_config": {"type": "default", "sep": " "}, + "max_tokens_in_context": 4096, + "max_chunks": 10, + }, + ], + tool_choice="auto", + tool_prompt_format="json", + input_shields=available_shields if available_shields else [], + output_shields=available_shields if available_shields else [], + enable_session_persistence=False, + ) + + agent = Agent(client, agent_config) + session_id = agent.create_session("test-session") + print(f"Created session_id={session_id} for Agent({agent.agent_id})") + + user_prompts = [ + ( + "I am attaching some documentation for Torchtune. Help me answer questions I will ask next.", + attachments, + ), + ( + "What are the top 5 topics that were explained? Only list succinct bullet points.", + None, + ), + ( + "Was anything related to 'Llama3' discussed, if so what?", + None, + ), + ( + "Tell me how to use LoRA", + None, + ), + ( + "What about Quantization?", + None, + ), + ] + + for prompt in user_prompts: + response = agent.create_turn( + messages=[ + { + "role": "user", + "content": prompt[0], + } + ], + attachments=prompt[1], + session_id=session_id, + ) + + async for log in EventLogger().log(response): + log.print() + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) ``` -You will see outputs of the form -- -``` -User> I am planning a trip to Switzerland, what are the top 3 places to visit? -inference> Switzerland is a beautiful country with a rich history, stunning landscapes, and vibrant culture. Here are three must-visit places to add to your itinerary: -... +## Next Steps -User> What is so special about #1? -inference> Jungfraujoch, also known as the "Top of Europe," is a unique and special place for several reasons: -... +- You can mix and match different providers for inference, memory, agents, evals etc. See [Building custom distributions](../distributions/index.md) +- [Developer Cookbook](developer_cookbook.md) -User> What other countries should I consider to club? -inference> Considering your interest in Switzerland, here are some neighboring countries that you may want to consider visiting: -``` +For example applications and more detailed tutorials, visit our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository. diff --git a/docs/source/index.md b/docs/source/index.md index a53952be7..f73020623 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -7,8 +7,7 @@ The Stack APIs are rapidly improving but still a work-in-progress. 
We invite fee ```{image} ../_static/llama-stack.png :alt: Llama Stack -:width: 600px -:align: center +:width: 400px ``` ## APIs @@ -86,8 +85,10 @@ You can find more example scripts with client SDKs to talk with the Llama Stack :maxdepth: 3 getting_started/index -cli_reference/index -cli_reference/download_models +distributions/index +llama_cli_reference/index +llama_cli_reference/download_models +llama_stack_client_cli_reference/index api_providers/index distribution_dev/index ``` diff --git a/docs/source/cli_reference/download_models.md b/docs/source/llama_cli_reference/download_models.md similarity index 100% rename from docs/source/cli_reference/download_models.md rename to docs/source/llama_cli_reference/download_models.md diff --git a/docs/source/cli_reference/index.md b/docs/source/llama_cli_reference/index.md similarity index 98% rename from docs/source/cli_reference/index.md rename to docs/source/llama_cli_reference/index.md index 39c566e59..aa2ecebf7 100644 --- a/docs/source/cli_reference/index.md +++ b/docs/source/llama_cli_reference/index.md @@ -1,4 +1,4 @@ -# CLI Reference +# llama CLI Reference The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package. @@ -119,7 +119,7 @@ You should see a table like this: To download models, you can use the llama download command. -#### Downloading from [Meta](https://llama.meta.com/llama-downloads/) +### Downloading from [Meta](https://llama.meta.com/llama-downloads/) Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/) @@ -137,7 +137,7 @@ llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL ``` -#### Downloading from [Hugging Face](https://huggingface.co/meta-llama) +### Downloading from [Hugging Face](https://huggingface.co/meta-llama) Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`. diff --git a/docs/source/llama_stack_client_cli_reference/index.md b/docs/source/llama_stack_client_cli_reference/index.md new file mode 100644 index 000000000..62a639acd --- /dev/null +++ b/docs/source/llama_stack_client_cli_reference/index.md @@ -0,0 +1,162 @@ +# llama-stack-client CLI Reference + +You may use the `llama-stack-client` to query information about the distribution. + +## Basic Commands + +### `llama-stack-client` +```bash +$ llama-stack-client -h + +usage: llama-stack-client [-h] {models,memory_banks,shields} ... + +Welcome to the LlamaStackClient CLI + +options: + -h, --help show this help message and exit + +subcommands: + {models,memory_banks,shields} +``` + +### `llama-stack-client configure` +```bash +$ llama-stack-client configure +> Enter the host name of the Llama Stack distribution server: localhost +> Enter the port number of the Llama Stack distribution server: 5000 +Done! 
You can now use the Llama Stack Client CLI with endpoint http://localhost:5000 +``` + +## Provider Commands + +### `llama-stack-client providers list` +```bash +$ llama-stack-client providers list +``` +``` ++-----------+----------------+-----------------+ +| API | Provider ID | Provider Type | ++===========+================+=================+ +| scoring | meta0 | meta-reference | ++-----------+----------------+-----------------+ +| datasetio | meta0 | meta-reference | ++-----------+----------------+-----------------+ +| inference | tgi0 | remote::tgi | ++-----------+----------------+-----------------+ +| memory | meta-reference | meta-reference | ++-----------+----------------+-----------------+ +| agents | meta-reference | meta-reference | ++-----------+----------------+-----------------+ +| telemetry | meta-reference | meta-reference | ++-----------+----------------+-----------------+ +| safety | meta-reference | meta-reference | ++-----------+----------------+-----------------+ +``` + +## Model Management + +### `llama-stack-client models list` +```bash +$ llama-stack-client models list +``` +``` ++----------------------+----------------------+---------------+----------------------------------------------------------+ +| identifier | llama_model | provider_id | metadata | ++======================+======================+===============+==========================================================+ +| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | tgi0 | {'huggingface_repo': 'meta-llama/Llama-3.1-8B-Instruct'} | ++----------------------+----------------------+---------------+----------------------------------------------------------+ +``` + +### `llama-stack-client models get` +```bash +$ llama-stack-client models get Llama3.1-8B-Instruct +``` + +``` ++----------------------+----------------------+----------------------------------------------------------+---------------+ +| identifier | llama_model | metadata | provider_id | ++======================+======================+==========================================================+===============+ +| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | {'huggingface_repo': 'meta-llama/Llama-3.1-8B-Instruct'} | tgi0 | ++----------------------+----------------------+----------------------------------------------------------+---------------+ +``` + + +```bash +$ llama-stack-client models get Random-Model + +Model RandomModel is not found at distribution endpoint host:port. Please ensure endpoint is serving specified model. 
+``` + +### `llama-stack-client models register` + +```bash +$ llama-stack-client models register [--provider-id ] [--provider-model-id ] [--metadata ] +``` + +### `llama-stack-client models update` + +```bash +$ llama-stack-client models update [--provider-id ] [--provider-model-id ] [--metadata ] +``` + +### `llama-stack-client models delete` + +```bash +$ llama-stack-client models delete +``` + +## Memory Bank Management + +### `llama-stack-client memory_banks list` +```bash +$ llama-stack-client memory_banks list +``` +``` ++--------------+----------------+--------+-------------------+------------------------+--------------------------+ +| identifier | provider_id | type | embedding_model | chunk_size_in_tokens | overlap_size_in_tokens | ++==============+================+========+===================+========================+==========================+ +| test_bank | meta-reference | vector | all-MiniLM-L6-v2 | 512 | 64 | ++--------------+----------------+--------+-------------------+------------------------+--------------------------+ +``` + +## Shield Management + +### `llama-stack-client shields list` +```bash +$ llama-stack-client shields list +``` + +``` ++--------------+----------+----------------+-------------+ +| identifier | params | provider_id | type | ++==============+==========+================+=============+ +| llama_guard | {} | meta-reference | llama_guard | ++--------------+----------+----------------+-------------+ +``` + +## Evaluation Tasks + +### `llama-stack-client eval_tasks list` +```bash +$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json +``` + +where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config +``` +$ cat ~/eval_task_config.json +{ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": "Llama3.1-405B-Instruct", + "sampling_params": { + "strategy": "greedy", + "temperature": 0, + "top_p": 0.95, + "top_k": 0, + "max_tokens": 0, + "repetition_penalty": 1.0 + } + } +} +``` From e84d4436b51260b2ad42cea2df5eeccc4f6fe9b6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 16:14:37 -0800 Subject: [PATCH 08/11] Since we are pushing for HF repos, we should accept them in inference configs (#497) # What does this PR do? As the title says. ## Test Plan This needs https://github.com/meta-llama/llama-models/commit/8752149f58654c54c012209f43b57bb476146f0c to also land. So the next package (0.0.54) will make this work properly. 
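For context, here is a minimal sketch of the relaxed validation this change introduces. It mirrors the `validate_model` hunks in the diff below (with `permitted_models` passed in so the snippet stands alone); treat it as an illustration rather than the exact final code.

```python
# Sketch: a model id is now accepted if it matches either the Llama
# descriptor (e.g. "Llama3.2-3B-Instruct") or the Hugging Face repo
# (e.g. "meta-llama/Llama-3.2-3B-Instruct") of a supported model.
def validate_model(model: str, permitted_models) -> str:
    descriptors = [m.descriptor() for m in permitted_models]
    repos = [m.huggingface_repo for m in permitted_models]
    if model not in (descriptors + repos):
        model_list = "\n\t".join(repos)
        raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]")
    return model
```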
The test is: ```bash pytest -v -s -m "llama_3b and meta_reference" test_model_registration.py ``` --- .../providers/inline/inference/meta_reference/config.py | 6 ++++-- llama_stack/providers/inline/inference/vllm/config.py | 7 +++++-- .../providers/tests/inference/test_model_registration.py | 1 - llama_stack/providers/utils/inference/__init__.py | 4 ++-- llama_stack/providers/utils/inference/prompt_adapter.py | 4 +++- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llama_stack/providers/inline/inference/meta_reference/config.py b/llama_stack/providers/inline/inference/meta_reference/config.py index 11648b117..4713e7f99 100644 --- a/llama_stack/providers/inline/inference/meta_reference/config.py +++ b/llama_stack/providers/inline/inference/meta_reference/config.py @@ -37,8 +37,10 @@ class MetaReferenceInferenceConfig(BaseModel): @classmethod def validate_model(cls, model: str) -> str: permitted_models = supported_inference_models() - if model not in permitted_models: - model_list = "\n\t".join(permitted_models) + descriptors = [m.descriptor() for m in permitted_models] + repos = [m.huggingface_repo for m in permitted_models] + if model not in (descriptors + repos): + model_list = "\n\t".join(repos) raise ValueError( f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]" ) diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index e5516673c..8a95298f4 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -48,8 +48,11 @@ class VLLMConfig(BaseModel): @classmethod def validate_model(cls, model: str) -> str: permitted_models = supported_inference_models() - if model not in permitted_models: - model_list = "\n\t".join(permitted_models) + + descriptors = [m.descriptor() for m in permitted_models] + repos = [m.huggingface_repo for m in permitted_models] + if model not in (descriptors + repos): + model_list = "\n\t".join(repos) raise ValueError( f"Unknown model: `{model}`. 
Choose from [\n\t{model_list}\n]" ) diff --git a/llama_stack/providers/tests/inference/test_model_registration.py b/llama_stack/providers/tests/inference/test_model_registration.py index 07100c982..1471bc369 100644 --- a/llama_stack/providers/tests/inference/test_model_registration.py +++ b/llama_stack/providers/tests/inference/test_model_registration.py @@ -11,7 +11,6 @@ import pytest # # pytest -v -s llama_stack/providers/tests/inference/test_model_registration.py # -m "meta_reference" -# --env TOGETHER_API_KEY= class TestModelRegistration: diff --git a/llama_stack/providers/utils/inference/__init__.py b/llama_stack/providers/utils/inference/__init__.py index 7d268ed38..d204f98a4 100644 --- a/llama_stack/providers/utils/inference/__init__.py +++ b/llama_stack/providers/utils/inference/__init__.py @@ -22,9 +22,9 @@ def is_supported_safety_model(model: Model) -> bool: ] -def supported_inference_models() -> List[str]: +def supported_inference_models() -> List[Model]: return [ - m.descriptor() + m for m in all_registered_models() if ( m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2} diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index 2df04664f..6e4d0752e 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -178,7 +178,9 @@ def chat_completion_request_to_messages( cprint(f"Could not resolve model {llama_model}", color="red") return request.messages - if model.descriptor() not in supported_inference_models(): + allowed_models = supported_inference_models() + descriptors = [m.descriptor() for m in allowed_models] + if model.descriptor() not in descriptors: cprint(f"Unsupported inference model? 
{model.descriptor()}", color="red") return request.messages From 2411a44833a61026ec18dbf625b484c826b24eea Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 14:44:04 -0800 Subject: [PATCH 09/11] Update more distribution docs to be simpler and partially codegen'ed --- distributions/bedrock/run.yaml | 46 +------- distributions/databricks/build.yaml | 1 - distributions/dependencies.json | 110 ++++++++++++++++++ distributions/hf-endpoint/build.yaml | 1 - distributions/hf-serverless/build.yaml | 1 - distributions/ollama-gpu/build.yaml | 1 - distributions/ollama-gpu/compose.yaml | 48 -------- distributions/ollama-gpu/run.yaml | 46 -------- .../{inline-vllm => vllm-gpu}/build.yaml | 0 .../{inline-vllm => vllm-gpu}/compose.yaml | 0 .../{inline-vllm => vllm-gpu}/run.yaml | 0 .../self_hosted_distro/bedrock.md | 85 +++++++------- .../self_hosted_distro/fireworks.md | 4 +- .../self_hosted_distro/meta-reference-gpu.md | 8 +- .../self_hosted_distro/ollama.md | 2 - .../distributions/self_hosted_distro/tgi.md | 16 ++- .../self_hosted_distro/together.md | 4 +- .../providers/inline/inference/vllm/config.py | 10 +- .../remote/inference/bedrock/config.py | 3 - .../providers/remote/inference/tgi/config.py | 24 ++++ llama_stack/providers/utils/bedrock/config.py | 6 +- llama_stack/templates/bedrock/__init__.py | 7 ++ llama_stack/templates/bedrock/bedrock.py | 38 ++++++ llama_stack/templates/bedrock/build.yaml | 22 +++- llama_stack/templates/bedrock/doc_template.md | 63 ++++++++++ llama_stack/templates/bedrock/run.yaml | 49 ++++++++ llama_stack/templates/databricks/build.yaml | 9 -- .../templates/fireworks/doc_template.md | 4 +- llama_stack/templates/hf-endpoint/__init__.py | 7 ++ llama_stack/templates/hf-endpoint/build.yaml | 22 +++- .../templates/hf-endpoint/hf_endpoint.py | 97 +++++++++++++++ .../hf-endpoint/run-with-safety.yaml | 68 +++++++++++ llama_stack/templates/hf-endpoint/run.yaml | 55 +++++++++ .../templates/hf-serverless/__init__.py | 7 ++ .../templates/hf-serverless/build.yaml | 22 +++- .../templates/hf-serverless/hf_serverless.py | 89 ++++++++++++++ .../hf-serverless/run-with-safety.yaml | 68 +++++++++++ llama_stack/templates/hf-serverless/run.yaml | 55 +++++++++ llama_stack/templates/inline-vllm/build.yaml | 13 --- .../meta-reference-gpu/doc_template.md | 10 +- .../meta-reference-quantized-gpu/__init__.py | 7 ++ .../doc_template.md | 54 +++++++++ .../meta_reference.py | 100 ++++++++++++++++ llama_stack/templates/ollama/doc_template.md | 4 +- llama_stack/templates/template.py | 13 ++- llama_stack/templates/tgi/doc_template.md | 16 ++- .../templates/together/doc_template.md | 6 +- llama_stack/templates/vllm-gpu/__init__.py | 7 ++ llama_stack/templates/vllm-gpu/build.yaml | 19 +++ llama_stack/templates/vllm-gpu/run.yaml | 58 +++++++++ llama_stack/templates/vllm-gpu/vllm.py | 74 ++++++++++++ 51 files changed, 1188 insertions(+), 291 deletions(-) mode change 100644 => 120000 distributions/bedrock/run.yaml delete mode 120000 distributions/databricks/build.yaml delete mode 120000 distributions/hf-endpoint/build.yaml delete mode 120000 distributions/hf-serverless/build.yaml delete mode 120000 distributions/ollama-gpu/build.yaml delete mode 100644 distributions/ollama-gpu/compose.yaml delete mode 100644 distributions/ollama-gpu/run.yaml rename distributions/{inline-vllm => vllm-gpu}/build.yaml (100%) rename distributions/{inline-vllm => vllm-gpu}/compose.yaml (100%) rename distributions/{inline-vllm => vllm-gpu}/run.yaml (100%) create mode 100644 llama_stack/templates/bedrock/__init__.py 
create mode 100644 llama_stack/templates/bedrock/bedrock.py create mode 100644 llama_stack/templates/bedrock/doc_template.md create mode 100644 llama_stack/templates/bedrock/run.yaml delete mode 100644 llama_stack/templates/databricks/build.yaml create mode 100644 llama_stack/templates/hf-endpoint/__init__.py create mode 100644 llama_stack/templates/hf-endpoint/hf_endpoint.py create mode 100644 llama_stack/templates/hf-endpoint/run-with-safety.yaml create mode 100644 llama_stack/templates/hf-endpoint/run.yaml create mode 100644 llama_stack/templates/hf-serverless/__init__.py create mode 100644 llama_stack/templates/hf-serverless/hf_serverless.py create mode 100644 llama_stack/templates/hf-serverless/run-with-safety.yaml create mode 100644 llama_stack/templates/hf-serverless/run.yaml delete mode 100644 llama_stack/templates/inline-vllm/build.yaml create mode 100644 llama_stack/templates/meta-reference-quantized-gpu/__init__.py create mode 100644 llama_stack/templates/meta-reference-quantized-gpu/doc_template.md create mode 100644 llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py create mode 100644 llama_stack/templates/vllm-gpu/__init__.py create mode 100644 llama_stack/templates/vllm-gpu/build.yaml create mode 100644 llama_stack/templates/vllm-gpu/run.yaml create mode 100644 llama_stack/templates/vllm-gpu/vllm.py diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml deleted file mode 100644 index 2f7cb36ef..000000000 --- a/distributions/bedrock/run.yaml +++ /dev/null @@ -1,45 +0,0 @@ -version: '2' -image_name: local -name: bedrock -docker_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - safety: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml new file mode 120000 index 000000000..f38abfc4e --- /dev/null +++ b/distributions/bedrock/run.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/bedrock/run.yaml \ No newline at end of file diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml deleted file mode 120000 index 66342fe6f..000000000 --- a/distributions/databricks/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/databricks/build.yaml \ No newline at end of file diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 92ebd1105..e7506537f 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,4 +1,32 @@ { + "hf-serverless": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + 
"torch --index-url https://download.pytorch.org/whl/cpu" + ], "together": [ "aiosqlite", "blobfile", @@ -26,6 +54,33 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "vllm-gpu": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "blobfile", @@ -108,6 +163,33 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "bedrock": [ + "aiosqlite", + "blobfile", + "boto3", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "meta-reference-gpu": [ "accelerate", "aiosqlite", @@ -167,5 +249,33 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "hf-endpoint": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml deleted file mode 120000 index a73c70c05..000000000 --- a/distributions/hf-endpoint/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/hf-endpoint/build.yaml \ No newline at end of file diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml deleted file mode 120000 index f2db0fd55..000000000 --- a/distributions/hf-serverless/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/hf-serverless/build.yaml \ No newline at end of file diff --git a/distributions/ollama-gpu/build.yaml b/distributions/ollama-gpu/build.yaml deleted file mode 120000 index 8772548e0..000000000 --- a/distributions/ollama-gpu/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/ollama-gpu/compose.yaml b/distributions/ollama-gpu/compose.yaml deleted file mode 100644 index c965c43c7..000000000 --- a/distributions/ollama-gpu/compose.yaml +++ /dev/null @@ -1,48 +0,0 @@ -services: - ollama: - image: ollama/ollama:latest - network_mode: "host" - volumes: - - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast - ports: - - "11434:11434" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=0 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: 1 - # Devices are reserved using a list of capabilities, making - # capabilities the only required 
field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. - capabilities: [gpu] - runtime: nvidia - llamastack: - depends_on: - - ollama - image: llamastack/distribution-ollama - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - # Link to ollama run.yaml file - - ./run.yaml:/root/llamastack-run-ollama.yaml - ports: - - "5000:5000" - # Hack: wait for ollama server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s -volumes: - ollama: diff --git a/distributions/ollama-gpu/run.yaml b/distributions/ollama-gpu/run.yaml deleted file mode 100644 index 25471c69f..000000000 --- a/distributions/ollama-gpu/run.yaml +++ /dev/null @@ -1,46 +0,0 @@ -version: '2' -image_name: local -docker_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: ollama - provider_type: remote::ollama - config: - url: ${env.OLLAMA_URL:http://127.0.0.1:11434} - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - excluded_categories: [] - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} -models: - - model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} - provider_id: ollama - - model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} - provider_id: ollama -shields: - - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} diff --git a/distributions/inline-vllm/build.yaml b/distributions/vllm-gpu/build.yaml similarity index 100% rename from distributions/inline-vllm/build.yaml rename to distributions/vllm-gpu/build.yaml diff --git a/distributions/inline-vllm/compose.yaml b/distributions/vllm-gpu/compose.yaml similarity index 100% rename from distributions/inline-vllm/compose.yaml rename to distributions/vllm-gpu/compose.yaml diff --git a/distributions/inline-vllm/run.yaml b/distributions/vllm-gpu/run.yaml similarity index 100% rename from distributions/inline-vllm/run.yaml rename to distributions/vllm-gpu/run.yaml diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index edef88390..1b88b01cc 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -6,59 +6,58 @@ self ``` -### Connect to a Llama Stack Bedrock Endpoint -- You may connect to Amazon Bedrock APIs for running LLM inference +The `llamastack/distribution-bedrock` distribution consists of the following provider configurations: -The `llamastack/distribution-bedrock` distribution consists of the following provider configurations. 
+| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::bedrock` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `remote::bedrock` | +| telemetry | `inline::meta-reference` | -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |--------------- |---------------- |---------------- |---------------- |---------------- | -| **Provider(s)** | remote::bedrock | meta-reference | meta-reference | remote::bedrock | meta-reference | + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) -### Docker: Start the Distribution (Single Node CPU) -> [!NOTE] -> This assumes you have valid AWS credentials configured with access to Amazon Bedrock. +### Prerequisite: API Keys -``` -$ cd distributions/bedrock && docker compose up +Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). + + +## Running Llama Stack with AWS Bedrock + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-bedrock \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN ``` -Make sure in your `run.yaml` file, your inference provider is pointing to the correct AWS configuration. E.g. -``` -inference: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: -``` - -### Conda llama stack run (Single Node CPU) +### Via Conda ```bash llama stack build --template bedrock --image-type conda -# -- modify run.yaml with valid AWS credentials -llama stack run ./run.yaml -``` - -### (Optional) Update Model Serving Configuration - -Use `llama-stack-client models list` to check the available models served by Amazon Bedrock. 
- -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | meta.llama3-1-8b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | meta.llama3-1-70b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | meta.llama3-1-405b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN ``` diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index e30bb1480..096eee4f5 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-fireworks \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` @@ -70,6 +68,6 @@ docker run \ ```bash llama stack build --template fireworks --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 65e1c8cf8..702f0ae0f 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-meta-reference-gpu \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run-with-safety.yaml:/root/my-run.yaml \ llamastack/distribution-meta-reference-gpu \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B @@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template meta-reference-gpu --image-type conda -llama stack run ./run.yaml \ +llama stack run distributions/meta-reference-gpu/run.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -89,7 +85,7 @@ llama stack run ./run.yaml \ If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml \ +llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B diff --git a/docs/source/distributions/self_hosted_distro/ollama.md 
b/docs/source/distributions/self_hosted_distro/ollama.md index fe65172f3..16c936f9e 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -66,9 +66,7 @@ docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-ollama \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.docker.internal:11434 diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index 3209b9100..a2315a770 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-tgi \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT @@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template tgi --image-type conda llama stack run ./run.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT - --env SAFETY_MODEL=$SAFETY_MODEL +llama stack run ./run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 303c62dcb..6e392c1e0 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-together \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` @@ -69,6 +67,6 @@ docker run \ ```bash llama stack build --template together --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index 8a95298f4..42b75332f 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -37,11 +37,11 @@ class VLLMConfig(BaseModel): @classmethod def sample_run_config(cls): return { - "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}", - "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}", - "max_tokens": "${env.VLLM_MAX_TOKENS:4096}", - "enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}", - "gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}", + "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}", + "tensor_parallel_size": 
"${env.TENSOR_PARALLEL_SIZE:1}", + "max_tokens": "${env.MAX_TOKENS:4096}", + "enforce_eager": "${env.ENFORCE_EAGER:False}", + "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}", } @field_validator("model") diff --git a/llama_stack/providers/remote/inference/bedrock/config.py b/llama_stack/providers/remote/inference/bedrock/config.py index 8e194700c..f2e8930be 100644 --- a/llama_stack/providers/remote/inference/bedrock/config.py +++ b/llama_stack/providers/remote/inference/bedrock/config.py @@ -4,11 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_models.schema_utils import json_schema_type - from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig -@json_schema_type class BedrockConfig(BedrockBaseConfig): pass diff --git a/llama_stack/providers/remote/inference/tgi/config.py b/llama_stack/providers/remote/inference/tgi/config.py index 55bda4179..230eaacab 100644 --- a/llama_stack/providers/remote/inference/tgi/config.py +++ b/llama_stack/providers/remote/inference/tgi/config.py @@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel): description="Your Hugging Face user access token (will default to locally saved token if not provided)", ) + @classmethod + def sample_run_config( + cls, + endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}", + api_token: str = "${env.HF_API_TOKEN}", + **kwargs, + ): + return { + "endpoint_name": endpoint_name, + "api_token": api_token, + } + @json_schema_type class InferenceAPIImplConfig(BaseModel): @@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel): default=None, description="Your Hugging Face user access token (will default to locally saved token if not provided)", ) + + @classmethod + def sample_run_config( + cls, + repo: str = "${env.INFERENCE_MODEL}", + api_token: str = "${env.HF_API_TOKEN}", + **kwargs, + ): + return { + "huggingface_repo": repo, + "api_token": api_token, + } diff --git a/llama_stack/providers/utils/bedrock/config.py b/llama_stack/providers/utils/bedrock/config.py index 55c5582a1..64865bd5f 100644 --- a/llama_stack/providers/utils/bedrock/config.py +++ b/llama_stack/providers/utils/bedrock/config.py @@ -5,11 +5,9 @@ # the root directory of this source tree. from typing import Optional -from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field -@json_schema_type class BedrockBaseConfig(BaseModel): aws_access_key_id: Optional[str] = Field( default=None, @@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel): default=3600, description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", ) + + @classmethod + def sample_run_config(cls, **kwargs): + return {} diff --git a/llama_stack/templates/bedrock/__init__.py b/llama_stack/templates/bedrock/__init__.py new file mode 100644 index 000000000..4e7965550 --- /dev/null +++ b/llama_stack/templates/bedrock/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .bedrock import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py new file mode 100644 index 000000000..cf3c342fe --- /dev/null +++ b/llama_stack/templates/bedrock/bedrock.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::bedrock"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["remote::bedrock"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + return DistributionTemplate( + name="bedrock", + distro_type="self_hosted", + description="Use AWS Bedrock for running LLM inference and safety", + docker_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=[], + run_configs={ + "run.yaml": RunConfigSettings(), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + }, + ) diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index c87762043..c73db3eae 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: bedrock distribution_spec: - description: Use Amazon Bedrock APIs. + description: Use AWS Bedrock for running LLM inference and safety + docker_image: null providers: - inference: remote::bedrock - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::bedrock + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - remote::bedrock + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/bedrock/doc_template.md b/llama_stack/templates/bedrock/doc_template.md new file mode 100644 index 000000000..9331382b6 --- /dev/null +++ b/llama_stack/templates/bedrock/doc_template.md @@ -0,0 +1,63 @@ +# Bedrock Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: + +{{ providers_table }} + + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} ({{ model.provider_model_id }})` +{% endfor %} +{% endif %} + + +### Prerequisite: API Keys + +Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). + + +## Running Llama Stack with AWS Bedrock + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. 
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-{{ name }} \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN +``` + +### Via Conda + +```bash +llama stack build --template {{ name }} --image-type conda +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN +``` diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml new file mode 100644 index 000000000..1f632a1f2 --- /dev/null +++ b/llama_stack/templates/bedrock/run.yaml @@ -0,0 +1,49 @@ +version: '2' +image_name: bedrock +docker_image: null +conda_env: bedrock +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: bedrock + provider_type: remote::bedrock + config: {} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db + safety: + - provider_id: bedrock + provider_type: remote::bedrock + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db +models: [] +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/databricks/build.yaml b/llama_stack/templates/databricks/build.yaml deleted file mode 100644 index aa22f54b2..000000000 --- a/llama_stack/templates/databricks/build.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: databricks -distribution_spec: - description: Use Databricks for running LLM inference - providers: - inference: remote::databricks - memory: inline::faiss - safety: inline::llama-guard - agents: meta-reference - telemetry: meta-reference diff --git a/llama_stack/templates/fireworks/doc_template.md b/llama_stack/templates/fireworks/doc_template.md index 2a91ece07..2f4be574d 100644 --- a/llama_stack/templates/fireworks/doc_template.md +++ b/llama_stack/templates/fireworks/doc_template.md @@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` @@ -55,6 +53,6 @@ docker run \ ```bash llama stack build --template fireworks --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` diff --git a/llama_stack/templates/hf-endpoint/__init__.py b/llama_stack/templates/hf-endpoint/__init__.py new file mode 100644 index 000000000..f2c00e3bf --- /dev/null +++ b/llama_stack/templates/hf-endpoint/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .hf_endpoint import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index 61fd12a2c..798cb3961 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: hf-endpoint distribution_spec: - description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints." + description: Use (an external) Hugging Face Inference Endpoint for running LLM inference + docker_image: null providers: - inference: remote::hf::endpoint - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::hf::endpoint + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py new file mode 100644 index 000000000..af00114ba --- /dev/null +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -0,0 +1,97 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::hf::endpoint"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="hf-endpoint", + provider_type="remote::hf::endpoint", + config=InferenceEndpointImplConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="hf-endpoint", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="hf-endpoint-safety", + ) + + return DistributionTemplate( + name="hf-endpoint", + distro_type="self_hosted", + description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="hf-endpoint-safety", + provider_type="remote::hf::endpoint", + config=InferenceEndpointImplConfig.sample_run_config( + endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}", + ), + ), + ] + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack 
distribution server", + ), + "HF_API_TOKEN": ( + "hf_...", + "Hugging Face API token", + ), + "INFERENCE_ENDPOINT_NAME": ( + "", + "HF Inference endpoint name for the main inference model", + ), + "SAFETY_INFERENCE_ENDPOINT_NAME": ( + "", + "HF Inference endpoint for the safety model", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model served by the HF Inference Endpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Safety model served by the HF Inference Endpoint", + ), + }, + ) diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml new file mode 100644 index 000000000..d518f29b8 --- /dev/null +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -0,0 +1,68 @@ +version: '2' +image_name: hf-endpoint +docker_image: null +conda_env: hf-endpoint +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-endpoint + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + - provider_id: hf-endpoint-safety + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-endpoint + provider_model_id: null +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: hf-endpoint-safety + provider_model_id: null +shields: +- params: null + shield_id: ${env.SAFETY_MODEL} + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml new file mode 100644 index 000000000..ff4e90606 --- /dev/null +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -0,0 +1,55 @@ +version: '2' +image_name: hf-endpoint +docker_image: null +conda_env: hf-endpoint +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-endpoint + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db + telemetry: + - 
provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-endpoint + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/__init__.py b/llama_stack/templates/hf-serverless/__init__.py new file mode 100644 index 000000000..a5f1ab54a --- /dev/null +++ b/llama_stack/templates/hf-serverless/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .hf_serverless import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index 065a14517..3c03a98c1 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: hf-serverless distribution_spec: - description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference." + description: Use (an external) Hugging Face Inference Endpoint for running LLM inference + docker_image: null providers: - inference: remote::hf::serverless - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::hf::serverless + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py new file mode 100644 index 000000000..5434de986 --- /dev/null +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::hf::serverless"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="hf-serverless", + provider_type="remote::hf::serverless", + config=InferenceAPIImplConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="hf-serverless", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="hf-serverless-safety", + ) + + return DistributionTemplate( + name="hf-serverless", + distro_type="self_hosted", + description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="hf-serverless-safety", + provider_type="remote::hf::serverless", + config=InferenceAPIImplConfig.sample_run_config( + repo="${env.SAFETY_MODEL}", + ), + ), + ] + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "HF_API_TOKEN": ( + "hf_...", + "Hugging Face API token", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model to be served by the HF Serverless endpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Safety model to be served by the HF Serverless endpoint", + ), + }, + ) diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml new file mode 100644 index 000000000..e7591bbf0 --- /dev/null +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -0,0 +1,68 @@ +version: '2' +image_name: hf-serverless +docker_image: null +conda_env: hf-serverless +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-serverless + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL} + api_token: ${env.HF_API_TOKEN} + - provider_id: hf-serverless-safety + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.SAFETY_MODEL} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db + 
telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-serverless + provider_model_id: null +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: hf-serverless-safety + provider_model_id: null +shields: +- params: null + shield_id: ${env.SAFETY_MODEL} + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml new file mode 100644 index 000000000..d7ec02f6a --- /dev/null +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -0,0 +1,55 @@ +version: '2' +image_name: hf-serverless +docker_image: null +conda_env: hf-serverless +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-serverless + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-serverless + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/inline-vllm/build.yaml b/llama_stack/templates/inline-vllm/build.yaml deleted file mode 100644 index 61d9e4db8..000000000 --- a/llama_stack/templates/inline-vllm/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: meta-reference-gpu -distribution_spec: - docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: inline::meta-reference - memory: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index 9a61ff691..de09efdb0 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run-with-safety.yaml:/root/my-run.yaml \ 
llamastack/distribution-{{ name }} \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B @@ -66,8 +62,8 @@ docker run \ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. ```bash -llama stack build --template meta-reference-gpu --image-type conda -llama stack run ./run.yaml \ +llama stack build --template {{ name }} --image-type conda +llama stack run distributions/{{ name }}/run.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -75,7 +71,7 @@ llama stack run ./run.yaml \ If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml \ +llama stack run distributions/{{ name }}/run-with-safety.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B diff --git a/llama_stack/templates/meta-reference-quantized-gpu/__init__.py b/llama_stack/templates/meta-reference-quantized-gpu/__init__.py new file mode 100644 index 000000000..1cfdb2c6a --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .meta_reference import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md new file mode 100644 index 000000000..afe1e3e20 --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -0,0 +1,54 @@ +# Meta Reference Quantized Distribution + +The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations. + + +| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- | +| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | + +The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. + +### Step 0. Prerequisite - Downloading Models +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. + +``` +$ ls ~/.llama/checkpoints +Llama3.2-3B-Instruct:int4-qlora-eo8 +``` + +### Step 1. Start the Distribution +#### (Option 1) Start with Docker +``` +$ cd distributions/meta-reference-quantized-gpu && docker compose up +``` + +> [!NOTE] +> This assumes you have access to GPU to start a local server with access to your GPU. + + +> [!NOTE] +> `~/.llama` should be the path containing downloaded weights of Llama models. + + +This will download and start running a pre-built docker container. 
Alternatively, you may use the following commands: + +``` +docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml +``` + +#### (Option 2) Start with Conda + +1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) + +2. Build the `meta-reference-quantized-gpu` distribution + +``` +$ llama stack build --template meta-reference-quantized-gpu --image-type conda +``` + +3. Start running distribution +``` +$ cd distributions/meta-reference-quantized-gpu +$ llama stack run ./run.yaml +``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py new file mode 100644 index 000000000..f254bc920 --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.inference.meta_reference import ( + MetaReferenceInferenceConfig, +) +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["inline::meta-reference"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="meta-reference-inference", + provider_type="inline::meta-reference", + config=MetaReferenceInferenceConfig.sample_run_config( + model="${env.INFERENCE_MODEL}", + checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", + ), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="meta-reference-inference", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="meta-reference-safety", + ) + + return DistributionTemplate( + name="meta-reference-gpu", + distro_type="self_hosted", + description="Use Meta Reference for running LLM inference", + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="meta-reference-safety", + provider_type="inline::meta-reference", + config=MetaReferenceInferenceConfig.sample_run_config( + model="${env.SAFETY_MODEL}", + checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}", + ), + ), + ], + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model loaded into the Meta Reference server", + ), + "INFERENCE_CHECKPOINT_DIR": ( + "null", + "Directory containing the Meta Reference 
model checkpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Name of the safety (Llama-Guard) model to use", + ), + "SAFETY_CHECKPOINT_DIR": ( + "null", + "Directory containing the Llama-Guard model checkpoint", + ), + }, + ) diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index 5a7a0d2f7..09fe8eabc 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -55,9 +55,7 @@ docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.docker.internal:11434 @@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash export LLAMA_STACK_PORT=5001 -llama stack build --template ollama --image-type conda +llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index fe0278718..bf74b95d1 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig class RunConfigSettings(BaseModel): provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict) - default_models: List[ModelInput] + default_models: Optional[List[ModelInput]] = None default_shields: Optional[List[ShieldInput]] = None def run_config( @@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel): __distro_dir__=f"distributions/{name}", db_name="registry.db", ), - models=self.default_models, + models=self.default_models or [], shields=self.default_shields or [], ) @@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel): providers: Dict[str, List[str]] run_configs: Dict[str, RunConfigSettings] - template_path: Path + template_path: Optional[Path] = None # Optional configuration run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None @@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel): with open(yaml_output_dir / yaml_pth, "w") as f: yaml.safe_dump(run_config.model_dump(), f, sort_keys=False) - docs = self.generate_markdown_docs() - with open(doc_output_dir / f"{self.name}.md", "w") as f: - f.write(docs if docs.endswith("\n") else docs + "\n") + if self.template_path: + docs = self.generate_markdown_docs() + with open(doc_output_dir / f"{self.name}.md", "w") as f: + f.write(docs if docs.endswith("\n") else docs + "\n") diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md index 0f6001e1a..42124696f 100644 --- a/llama_stack/templates/tgi/doc_template.md +++ b/llama_stack/templates/tgi/doc_template.md @@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT @@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL + 
--port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT - --env SAFETY_MODEL=$SAFETY_MODEL +llama stack run ./run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/llama_stack/templates/together/doc_template.md b/llama_stack/templates/together/doc_template.md index 5c1580dac..3fc94dd35 100644 --- a/llama_stack/templates/together/doc_template.md +++ b/llama_stack/templates/together/doc_template.md @@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` @@ -53,8 +51,8 @@ docker run \ ### Via Conda ```bash -llama stack build --template together --image-type conda +llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` diff --git a/llama_stack/templates/vllm-gpu/__init__.py b/llama_stack/templates/vllm-gpu/__init__.py new file mode 100644 index 000000000..7b3d59a01 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .vllm import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml new file mode 100644 index 000000000..6792a855f --- /dev/null +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -0,0 +1,19 @@ +version: '2' +name: vllm-gpu +distribution_spec: + description: Use a built-in vLLM engine for running LLM inference + docker_image: null + providers: + inference: + - inline::vllm + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml new file mode 100644 index 000000000..a140ad403 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -0,0 +1,58 @@ +version: '2' +image_name: vllm-gpu +docker_image: null +conda_env: vllm-gpu +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: vllm + provider_type: inline::vllm + config: + model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} + tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1} + max_tokens: ${env.MAX_TOKENS:4096} + enforce_eager: ${env.ENFORCE_EAGER:False} + gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py new file mode 100644 index 000000000..78fcf4f57 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/vllm.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.distribution.datatypes import ModelInput, Provider +from llama_stack.providers.inline.inference.vllm import VLLMConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["inline::vllm"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="vllm", + provider_type="inline::vllm", + config=VLLMConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="vllm", + ) + + return DistributionTemplate( + name="vllm-gpu", + distro_type="self_hosted", + description="Use a built-in vLLM engine for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model loaded into the vLLM engine", + ), + "TENSOR_PARALLEL_SIZE": ( + "1", + "Number of tensor parallel replicas (number of GPUs to use).", + ), + "MAX_TOKENS": ( + "4096", + "Maximum number of tokens to generate.", + ), + "ENFORCE_EAGER": ( + "False", + "Whether to use eager mode for inference (otherwise cuda graphs are used).", + ), + "GPU_MEMORY_UTILIZATION": ( + "0.7", + "GPU memory utilization for the vLLM engine.", + ), + }, + ) From cd6ccb664ccc3960d927772abb5df541e5727ce0 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 23:20:05 -0800 Subject: [PATCH 10/11] Integrate distro docs into the restructured docs --- distributions/dependencies.json | 34 +++++++ .../self_hosted_distro/bedrock.md | 1 + .../meta-reference-quantized-gpu.md | 95 ++++++++++++------ .../self_hosted_distro/remote-vllm.md | 1 - .../inline/inference/meta_reference/config.py | 16 ++- llama_stack/scripts/distro_codegen.py | 2 +- llama_stack/templates/bedrock/doc_template.md | 7 ++ .../templates/fireworks/doc_template.md | 7 ++ .../meta-reference-gpu/doc_template.md | 7 ++ .../meta-reference-quantized-gpu/build.yaml | 18 ++-- .../doc_template.md | 97 +++++++++++++------ .../meta_reference.py | 49 ++-------- .../meta-reference-quantized-gpu/run.yaml | 58 +++++++++++ llama_stack/templates/ollama/doc_template.md | 7 ++ .../templates/remote-vllm/doc_template.md | 6 ++ llama_stack/templates/tgi/doc_template.md | 7 ++ .../templates/together/doc_template.md | 9 +- 17 files changed, 306 insertions(+), 115 deletions(-) create mode 100644 llama_stack/templates/meta-reference-quantized-gpu/run.yaml diff --git a/distributions/dependencies.json b/distributions/dependencies.json index e7506537f..36426e862 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -222,6 +222,40 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "meta-reference-quantized-gpu": [ + "accelerate", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fbgemm-gpu", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + 
"psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", + "tqdm", + "transformers", + "uvicorn", + "zmq", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "ollama": [ "aiohttp", "aiosqlite", diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index 1b88b01cc..8bb9d8fc5 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -1,4 +1,5 @@ # Bedrock Distribution + ```{toctree} :maxdepth: 2 :hidden: diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 7dcc642d5..b5b52c1f4 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -7,55 +7,86 @@ self ``` -The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations. +The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations: +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `inline::meta-reference-quantized` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- | -| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. -### Step 0. Prerequisite - Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. +Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs. + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`) +- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`) + + +## Prerequisite: Downloading Models + +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
``` $ ls ~/.llama/checkpoints -Llama3.2-3B-Instruct:int4-qlora-eo8 +Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B +Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M ``` -### Step 1. Start the Distribution -#### (Option 1) Start with Docker -``` -$ cd distributions/meta-reference-quantized-gpu && docker compose up +## Running the Distribution + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-meta-reference-quantized-gpu \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` -> [!NOTE] -> This assumes you have access to GPU to start a local server with access to your GPU. +If you are using Llama Stack Safety / Shield APIs, use: - -> [!NOTE] -> `~/.llama` should be the path containing downloaded weights of Llama models. - - -This will download and start running a pre-built docker container. Alternatively, you may use the following commands: - -``` -docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml +```bash +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-meta-reference-quantized-gpu \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ + --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B ``` -#### (Option 2) Start with Conda +### Via Conda -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. -2. Build the `meta-reference-quantized-gpu` distribution - -``` -$ llama stack build --template meta-reference-quantized-gpu --image-type conda +```bash +llama stack build --template meta-reference-quantized-gpu --image-type conda +llama stack run distributions/meta-reference-quantized-gpu/run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` -3. 
Start running distribution -``` -$ cd distributions/meta-reference-quantized-gpu -$ llama stack run ./run.yaml +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ + --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B ``` diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index 235cc1e0f..abebe5929 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -1,5 +1,4 @@ # Remote vLLM Distribution - ```{toctree} :maxdepth: 2 :hidden: diff --git a/llama_stack/providers/inline/inference/meta_reference/config.py b/llama_stack/providers/inline/inference/meta_reference/config.py index 4713e7f99..04058d55d 100644 --- a/llama_stack/providers/inline/inference/meta_reference/config.py +++ b/llama_stack/providers/inline/inference/meta_reference/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from llama_models.datatypes import * # noqa: F403 from llama_models.sku_list import resolve_model @@ -56,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel): cls, model: str = "Llama3.2-3B-Instruct", checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}", + **kwargs, ) -> Dict[str, Any]: return { "model": model, @@ -66,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel): class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig): quantization: QuantizationConfig + + @classmethod + def sample_run_config( + cls, + model: str = "Llama3.2-3B-Instruct", + checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}", + **kwargs, + ) -> Dict[str, Any]: + config = super().sample_run_config(model, checkpoint_dir, **kwargs) + config["quantization"] = { + "type": "fp8", + } + return config diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 84bf9af2a..90f0dac93 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -50,7 +50,7 @@ def process_template(template_dir: Path, progress) -> None: template.save_distribution( yaml_output_dir=REPO_ROOT / "llama_stack" / "templates" / template.name, doc_output_dir=REPO_ROOT - / "docs/source/getting_started/distributions" + / "docs/source/distributions" / f"{template.distro_type}_distro", ) else: diff --git a/llama_stack/templates/bedrock/doc_template.md b/llama_stack/templates/bedrock/doc_template.md index 9331382b6..2121719b7 100644 --- a/llama_stack/templates/bedrock/doc_template.md +++ b/llama_stack/templates/bedrock/doc_template.md @@ -1,5 +1,12 @@ # Bedrock Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: {{ providers_table }} diff --git a/llama_stack/templates/fireworks/doc_template.md b/llama_stack/templates/fireworks/doc_template.md index 2f4be574d..1b072d277 100644 --- a/llama_stack/templates/fireworks/doc_template.md +++ b/llama_stack/templates/fireworks/doc_template.md @@ -1,5 +1,12 @@ # Fireworks Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-{{ name }}` distribution consists of the following provider 
configurations. {{ providers_table }} diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index de09efdb0..66debfb1f 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -1,5 +1,12 @@ # Meta Reference Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: {{ providers_table }} diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml index a22490b5e..961864dac 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml @@ -1,13 +1,19 @@ +version: '2' name: meta-reference-quantized-gpu distribution_spec: - docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime - description: Use code from `llama_stack` itself to serve all llama stack APIs + description: Use Meta Reference with fp8, int4 quantization for running LLM inference + docker_image: null providers: - inference: meta-reference-quantized + inference: + - inline::meta-reference-quantized memory: - inline::faiss - remote::chromadb - remote::pgvector - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index afe1e3e20..60c64c222 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -1,54 +1,87 @@ # Meta Reference Quantized Distribution -The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations. +```{toctree} +:maxdepth: 2 +:hidden: +self +``` -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- | -| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: + +{{ providers_table }} The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. -### Step 0. Prerequisite - Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. +Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs. 
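
The run configurations generated from these templates lean on `${env.VAR:default}` placeholders (for example `${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db`). The snippet below is only an illustrative sketch of how such a placeholder could be expanded — a plain regex substitution written for this doc, not the resolver Llama Stack actually ships:

```python
import os
import re

# Illustrative only: expand ${env.NAME:default} placeholders like the ones used
# in the run.yaml files above. The real Llama Stack resolver is not shown here.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def resolve_placeholders(value: str) -> str:
    def _sub(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        # Use the environment variable if set, otherwise the inline default.
        return os.environ.get(name, default if default is not None else "")

    return _PLACEHOLDER.sub(_sub, value)


# Falls back to the default path when SQLITE_STORE_DIR is unset.
print(resolve_placeholders(
    "${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db"
))
```

With `SQLITE_STORE_DIR` unset, the call above simply returns the default path embedded in the placeholder, which matches the fallback behavior the run.yaml files rely on.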
+ +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + + +## Prerequisite: Downloading Models + +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints -Llama3.2-3B-Instruct:int4-qlora-eo8 +Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B +Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M ``` -### Step 1. Start the Distribution -#### (Option 1) Start with Docker -``` -$ cd distributions/meta-reference-quantized-gpu && docker compose up +## Running the Distribution + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-{{ name }} \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` -> [!NOTE] -> This assumes you have access to GPU to start a local server with access to your GPU. +If you are using Llama Stack Safety / Shield APIs, use: - -> [!NOTE] -> `~/.llama` should be the path containing downloaded weights of Llama models. - - -This will download and start running a pre-built docker container. Alternatively, you may use the following commands: - -``` -docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml +```bash +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-{{ name }} \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ + --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B ``` -#### (Option 2) Start with Conda +### Via Conda -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. -2. Build the `meta-reference-quantized-gpu` distribution - -``` -$ llama stack build --template meta-reference-quantized-gpu --image-type conda +```bash +llama stack build --template {{ name }} --image-type conda +llama stack run distributions/{{ name }}/run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` -3. 
Start running distribution -``` -$ cd distributions/meta-reference-quantized-gpu -$ llama stack run ./run.yaml +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +llama stack run distributions/{{ name }}/run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ + --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B ``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index f254bc920..1ff5d31d6 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -6,16 +6,16 @@ from pathlib import Path -from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.distribution.datatypes import ModelInput, Provider from llama_stack.providers.inline.inference.meta_reference import ( - MetaReferenceInferenceConfig, + MetaReferenceQuantizedInferenceConfig, ) from llama_stack.templates.template import DistributionTemplate, RunConfigSettings def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["inline::meta-reference"], + "inference": ["inline::meta-reference-quantized"], "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], @@ -24,8 +24,8 @@ def get_distribution_template() -> DistributionTemplate: inference_provider = Provider( provider_id="meta-reference-inference", - provider_type="inline::meta-reference", - config=MetaReferenceInferenceConfig.sample_run_config( + provider_type="inline::meta-reference-quantized", + config=MetaReferenceQuantizedInferenceConfig.sample_run_config( model="${env.INFERENCE_MODEL}", checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", ), @@ -35,18 +35,13 @@ def get_distribution_template() -> DistributionTemplate: model_id="${env.INFERENCE_MODEL}", provider_id="meta-reference-inference", ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="meta-reference-safety", - ) - return DistributionTemplate( - name="meta-reference-gpu", + name="meta-reference-quantized-gpu", distro_type="self_hosted", - description="Use Meta Reference for running LLM inference", + description="Use Meta Reference with fp8, int4 quantization for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", providers=providers, - default_models=[inference_model, safety_model], + default_models=[inference_model], run_configs={ "run.yaml": RunConfigSettings( provider_overrides={ @@ -54,26 +49,6 @@ def get_distribution_template() -> DistributionTemplate: }, default_models=[inference_model], ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - Provider( - provider_id="meta-reference-safety", - provider_type="inline::meta-reference", - config=MetaReferenceInferenceConfig.sample_run_config( - model="${env.SAFETY_MODEL}", - checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}", - ), - ), - ], - }, - default_models=[ - inference_model, - safety_model, - ], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - ), }, run_config_env_vars={ "LLAMASTACK_PORT": ( @@ -88,13 +63,5 @@ def get_distribution_template() -> DistributionTemplate: "null", "Directory containing the Meta Reference model checkpoint", ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Name of the safety 
(Llama-Guard) model to use", - ), - "SAFETY_CHECKPOINT_DIR": ( - "null", - "Directory containing the Llama-Guard model checkpoint", - ), }, ) diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml new file mode 100644 index 000000000..e1104b623 --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -0,0 +1,58 @@ +version: '2' +image_name: meta-reference-quantized-gpu +docker_image: null +conda_env: meta-reference-quantized-gpu +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: meta-reference-inference + provider_type: inline::meta-reference-quantized + config: + model: ${env.INFERENCE_MODEL} + max_seq_len: 4096 + checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null} + quantization: + type: fp8 + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: meta-reference-inference + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index 09fe8eabc..7671ca3cf 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -1,5 +1,12 @@ # Ollama Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. {{ providers_table }} diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md index 63432fb70..7614e4f77 100644 --- a/llama_stack/templates/remote-vllm/doc_template.md +++ b/llama_stack/templates/remote-vllm/doc_template.md @@ -1,4 +1,10 @@ # Remote vLLM Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md index 42124696f..0938e656d 100644 --- a/llama_stack/templates/tgi/doc_template.md +++ b/llama_stack/templates/tgi/doc_template.md @@ -1,5 +1,12 @@ # TGI Distribution +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
{{ providers_table }} diff --git a/llama_stack/templates/together/doc_template.md b/llama_stack/templates/together/doc_template.md index 3fc94dd35..dc150ff09 100644 --- a/llama_stack/templates/together/doc_template.md +++ b/llama_stack/templates/together/doc_template.md @@ -1,4 +1,11 @@ -# Fireworks Distribution +# Together Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. From cf079a22a06238345055be7011db472e1276e6c1 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 20 Nov 2024 23:24:59 -0800 Subject: [PATCH 11/11] Plurals --- docs/source/distributions/index.md | 6 +++--- docs/source/distributions/ondevice_distro/index.md | 2 +- docs/source/distributions/remote_hosted_distro/index.md | 2 +- docs/source/distributions/self_hosted_distro/index.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md index 753555d5b..bedc9706e 100644 --- a/docs/source/distributions/index.md +++ b/docs/source/distributions/index.md @@ -46,9 +46,9 @@ If so, we suggest: Please see our pages in detail for the types of distributions we offer: -1. [Self-Hosted Distribution](./self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine. -2. [Remote-Hosted Distribution](./remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider. -3. [On-device Distribution](./ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. +1. [Self-Hosted Distributions](./self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine. +2. [Remote-Hosted Distributions](./remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider. +3. [On-device Distributions](./ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. ## Building Your Own Distribution diff --git a/docs/source/distributions/ondevice_distro/index.md b/docs/source/distributions/ondevice_distro/index.md index b3228455d..d615e70ed 100644 --- a/docs/source/distributions/ondevice_distro/index.md +++ b/docs/source/distributions/ondevice_distro/index.md @@ -1,4 +1,4 @@ -# On-Device Distribution +# On-Device Distributions On-device distributions are Llama Stack distributions that run locally on your iOS / Android device. diff --git a/docs/source/distributions/remote_hosted_distro/index.md b/docs/source/distributions/remote_hosted_distro/index.md index 308d29fa1..d2c9282fc 100644 --- a/docs/source/distributions/remote_hosted_distro/index.md +++ b/docs/source/distributions/remote_hosted_distro/index.md @@ -1,4 +1,4 @@ -# Remote-Hosted Distribution +# Remote-Hosted Distributions ```{toctree} :maxdepth: 2 diff --git a/docs/source/distributions/self_hosted_distro/index.md b/docs/source/distributions/self_hosted_distro/index.md index fb775fb52..53a3c7b20 100644 --- a/docs/source/distributions/self_hosted_distro/index.md +++ b/docs/source/distributions/self_hosted_distro/index.md @@ -1,4 +1,4 @@ -# Self-Hosted Distribution +# Self-Hosted Distributions ```{toctree} :maxdepth: 2