From 0e4307de0f4fa531ac382654a082b4bc5ba3b7b1 Mon Sep 17 00:00:00 2001
From: Derek Higgins
Date: Fri, 25 Apr 2025 20:17:31 +0100
Subject: [PATCH] docs: Fix missing --gpus all flag in Docker run commands
 (#2026)

Adding the --gpus all flag to the Docker run commands for the
meta-reference-gpu distribution ensures models are loaded onto the GPU
instead of the CPU.

Remove the docs for meta-reference-quantized-gpu. The distribution was
removed in #1887, but these files were left behind.

Fixes: #1798

# What does this PR do?
Fixes the docs to add the --gpus all flag to the docker run commands.

Closes #1798

## Test Plan
Verified against the Docker documentation; the commands were not run
end-to-end.

---------

Signed-off-by: Derek Higgins
---
 README.md                                     |   1 -
 docs/source/distributions/building_distro.md  |   2 -
 .../self_hosted_distro/meta-reference-gpu.md  |   2 +
 .../meta-reference-quantized-gpu.md           | 123 ------------------
 .../meta-reference-gpu/doc_template.md        |   2 +
 5 files changed, 4 insertions(+), 126 deletions(-)
 delete mode 100644 docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md

diff --git a/README.md b/README.md
index c2e688763..9a4f1a849 100644
--- a/README.md
+++ b/README.md
@@ -129,7 +129,6 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
 | **Distribution** | **Llama Stack Docker** | Start This Distribution |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
 | Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
-| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
 | SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
 | Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
 | Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index 4c342b14b..56b8d30a8 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -109,8 +109,6 @@ llama stack build --list-templates
 +------------------------------+-----------------------------------------------------------------------------+
 | nvidia                       | Use NVIDIA NIM for running LLM inference                                    |
 +------------------------------+-----------------------------------------------------------------------------+
-| meta-reference-quantized-gpu | Use Meta Reference with fp8, int4 quantization for running LLM inference    |
-+------------------------------+-----------------------------------------------------------------------------+
 | cerebras                     | Use Cerebras for running LLM inference                                      |
 +------------------------------+-----------------------------------------------------------------------------+
 | ollama                       | Use (an external) Ollama server for running LLM inference                   |
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
index b90f75347..f58d7bbee 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -81,6 +81,7 @@ LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
+  --gpus all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
@@ -94,6 +95,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   --pull always \
+  --gpus all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
deleted file mode 100644
index c3e2b4f2c..000000000
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ /dev/null
@@ -1,123 +0,0 @@
----
-orphan: true
----
-
-# Meta Reference Quantized Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations:
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `inline::meta-reference-quantized` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
-
-Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
-
-### Environment Variables
-
-The following environment variables can be configured:
-
-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
-- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
-
-
-## Prerequisite: Downloading Models
-
-Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-
-```
-$ llama model list --downloaded
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
-```
-
-## Running the Distribution
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  llamastack/distribution-meta-reference-quantized-gpu \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  llamastack/distribution-meta-reference-quantized-gpu \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-```
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-llama stack build --template meta-reference-quantized-gpu --image-type conda
-llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-```
diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md
index a174331b4..2ca6793d7 100644
--- a/llama_stack/templates/meta-reference-gpu/doc_template.md
+++ b/llama_stack/templates/meta-reference-gpu/doc_template.md
@@ -69,6 +69,7 @@ LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
+  --gpus all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
@@ -82,6 +83,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   --pull always \
+  --gpus all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
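
A quick way to sanity-check the `--gpus all` flag locally, as a minimal sketch rather than part of the patch (the `nvidia/cuda:12.4.1-base-ubuntu22.04` image tag is an assumption; any CUDA base image should behave the same):

```bash
# Requires the NVIDIA Container Toolkit on the host.
# With --gpus all, the container sees every host GPU and nvidia-smi
# prints the usual device table; without the flag, the same command
# fails because no GPU is visible inside the container.
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```

The same check applies to the distribution images above: if `nvidia-smi` works with `--gpus all`, the meta-reference-gpu server started with that flag will load models onto the GPU rather than falling back to CPU.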