From 7ee0ee78437d3afb77cdd559ca395924061e1f5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 10 Oct 2025 01:50:33 +0200 Subject: [PATCH] chore!: remove model mgmt from CLI for Hugging Face CLI (#3700) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change removes the `llama model` and `llama download` subcommands from the CLI, replacing them with recommendations to use the Hugging Face CLI instead. Rationale for this change: - The model management functionality was largely duplicating what Hugging Face CLI already provides, leading to unnecessary maintenance overhead (except the download source from Meta?) - Maintaining our own implementation required fixing bugs and keeping up with changes in model repositories and download mechanisms - The Hugging Face CLI is more mature, widely adopted, and better maintained - This allows us to focus on the core Llama Stack functionality rather than reimplementing model management tools Changes made: - Removed all model-related CLI commands and their implementations - Updated documentation to recommend using `huggingface-cli` for model downloads - Removed Meta-specific download logic and statements - Simplified the CLI to focus solely on stack management operations Users should now use: - `huggingface-cli download` for downloading models - `huggingface-cli scan-cache` for listing downloaded models This is a breaking change as it removes previously available CLI commands. Signed-off-by: Sébastien Han --- .github/workflows/python-build-test.yml | 2 - README.md | 2 +- .../self_hosted_distro/meta-reference-gpu.md | 26 +- .../llama_cli_reference/download_models.md | 145 +---- .../references/llama_cli_reference/index.md | 238 +-------- docs/getting_started_llama4.ipynb | 4 +- llama_stack/cli/download.py | 495 ------------------ llama_stack/cli/llama.py | 6 - llama_stack/cli/model/__init__.py | 7 - llama_stack/cli/model/describe.py | 70 --- llama_stack/cli/model/download.py | 24 - llama_stack/cli/model/list.py | 119 ----- llama_stack/cli/model/model.py | 43 -- llama_stack/cli/model/prompt_format.py | 133 ----- llama_stack/cli/model/remove.py | 68 --- llama_stack/cli/model/safety_models.py | 64 --- llama_stack/cli/model/verify_download.py | 24 - llama_stack/cli/verify_download.py | 141 ----- .../meta-reference-gpu/doc_template.md | 26 +- pyproject.toml | 34 +- uv.lock | 4 +- 21 files changed, 63 insertions(+), 1612 deletions(-) delete mode 100644 llama_stack/cli/download.py delete mode 100644 llama_stack/cli/model/__init__.py delete mode 100644 llama_stack/cli/model/describe.py delete mode 100644 llama_stack/cli/model/download.py delete mode 100644 llama_stack/cli/model/list.py delete mode 100644 llama_stack/cli/model/model.py delete mode 100644 llama_stack/cli/model/prompt_format.py delete mode 100644 llama_stack/cli/model/remove.py delete mode 100644 llama_stack/cli/model/safety_models.py delete mode 100644 llama_stack/cli/model/verify_download.py delete mode 100644 llama_stack/cli/verify_download.py diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml index fca7c4b4c..c6c443811 100644 --- a/.github/workflows/python-build-test.yml +++ b/.github/workflows/python-build-test.yml @@ -43,7 +43,5 @@ jobs: uv pip list uv pip show llama-stack command -v llama - llama model prompt-format -m Llama3.2-90B-Vision-Instruct - llama model list llama stack list-apis llama stack list-providers inference diff --git a/README.md b/README.md index 
9cb9e32fc..75e9989d7 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ pip install -U llama_stack MODEL="Llama-4-Scout-17B-16E-Instruct" # get meta url from llama.com -llama model download --source meta --model-id $MODEL --meta-url +huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL # start a llama stack server INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu diff --git a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md index 1c0ef5f6e..403a31667 100644 --- a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md @@ -41,31 +41,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. - -``` -$ llama model list --downloaded -┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ -┃ Model ┃ Size ┃ Modified Time ┃ -┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ -│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ -└─────────────────────────────────────────┴──────────┴─────────────────────┘ +Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI. ``` ## Running the Distribution diff --git a/docs/docs/references/llama_cli_reference/download_models.md b/docs/docs/references/llama_cli_reference/download_models.md index a9af65349..542740202 100644 --- a/docs/docs/references/llama_cli_reference/download_models.md +++ b/docs/docs/references/llama_cli_reference/download_models.md @@ -25,141 +25,42 @@ You have two ways to install Llama Stack: cd llama-stack pip install -e . -## Downloading models via CLI +## Downloading models via Hugging Face CLI -You first need to have models downloaded locally. +You first need to have models downloaded locally. 
We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models. -To download any model you need the **Model Descriptor**. -This can be obtained by running the command -``` -llama model list -``` +### Install Hugging Face CLI -You should see a table like this: - -``` -+----------------------------------+------------------------------------------+----------------+ -| Model Descriptor(ID) | Hugging Face Repo | Context Length | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-70B | meta-llama/Llama-3.1-70B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B:bf16-mp8 | meta-llama/Llama-3.1-405B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B | meta-llama/Llama-3.1-405B-FP8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B:bf16-mp16 | meta-llama/Llama-3.1-405B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-8B-Instruct | meta-llama/Llama-3.1-8B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-70B-Instruct | meta-llama/Llama-3.1-70B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct:bf16-mp8 | meta-llama/Llama-3.1-405B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct | meta-llama/Llama-3.1-405B-Instruct-FP8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-1B | meta-llama/Llama-3.2-1B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-3B | meta-llama/Llama-3.2-3B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-90B-Vision | meta-llama/Llama-3.2-90B-Vision | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-1B-Instruct | meta-llama/Llama-3.2-1B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-3B-Instruct | meta-llama/Llama-3.2-3B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-11B-Vision-Instruct | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-90B-Vision-Instruct | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| 
Llama-Guard-3-11B-Vision | meta-llama/Llama-Guard-3-11B-Vision | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-1B:int4-mp1 | meta-llama/Llama-Guard-3-1B-INT4 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-1B | meta-llama/Llama-Guard-3-1B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-8B | meta-llama/Llama-Guard-3-8B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-8B:int8-mp1 | meta-llama/Llama-Guard-3-8B-INT8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Prompt-Guard-86M | meta-llama/Prompt-Guard-86M | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-2-8B | meta-llama/Llama-Guard-2-8B | 4K | -+----------------------------------+------------------------------------------+----------------+ -``` - -To download models, you can use the llama download command. - -#### Downloading from [Meta](https://llama.meta.com/llama-downloads/) - -Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/). Note: You need to quote the META_URL - -Download the required checkpoints using the following commands: +First, install the Hugging Face CLI: ```bash -# download the 8B model, this can be run on a single GPU -llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url 'META_URL' - -# you can also get the 70B model, this will require 8 GPUs however -llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url 'META_URL' - -# llama-agents have safety enabled by default. For this, you will need -# safety models -- Llama-Guard and Prompt-Guard -llama download --source meta --model-id Prompt-Guard-86M --meta-url 'META_URL' -llama download --source meta --model-id Llama-Guard-3-1B --meta-url 'META_URL' +pip install huggingface_hub[cli] ``` -#### Downloading from [Hugging Face](https://huggingface.co/meta-llama) +### Download models from Hugging Face -Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`. +You can download models using the `huggingface-cli download` command. Here are some examples: ```bash -llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token +# Download Llama 3.2 3B Instruct model +huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct -llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token +# Download Llama 3.2 1B Instruct model +huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --local-dir ~/.llama/Llama-3.2-1B-Instruct -llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original* -llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original* -``` - -**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). 
- -```{tip} -Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. +# Download Llama Guard 3 1B model +huggingface-cli download meta-llama/Llama-Guard-3-1B --local-dir ~/.llama/Llama-Guard-3-1B + +# Download Prompt Guard model +huggingface-cli download meta-llama/Prompt-Guard-86M --local-dir ~/.llama/Prompt-Guard-86M ``` +**Important:** You need to authenticate with Hugging Face to download models. You can do this by: +1. Getting your token from [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) +2. Running `huggingface-cli login` and entering your token ## List the downloaded models -To list the downloaded models with the following command: -``` -llama model list --downloaded -``` - -You should see a table like this: -``` -┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ -┃ Model ┃ Size ┃ Modified Time ┃ -┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ -│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ -└─────────────────────────────────────────┴──────────┴─────────────────────┘ +To list the downloaded models, you can use the Hugging Face CLI: +```bash +# List all downloaded models in your local cache +huggingface-cli scan-cache ``` diff --git a/docs/docs/references/llama_cli_reference/index.md b/docs/docs/references/llama_cli_reference/index.md index 9b71a6795..0bebc601d 100644 --- a/docs/docs/references/llama_cli_reference/index.md +++ b/docs/docs/references/llama_cli_reference/index.md @@ -27,9 +27,9 @@ You have two ways to install Llama Stack: ## `llama` subcommands -1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models) -2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models) -3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation. +1. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. 
You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation. + +For downloading models, we recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli). See [Downloading models](#downloading-models) for more information. ### Sample Usage @@ -38,239 +38,41 @@ llama --help ``` ``` -usage: llama [-h] {download,model,stack} ... +usage: llama [-h] {stack} ... Welcome to the Llama CLI options: - -h, --help show this help message and exit + -h, --help show this help message and exit subcommands: - {download,model,stack} + {stack} + + stack Operations for the Llama Stack / Distributions ``` ## Downloading models -You first need to have models downloaded locally. +You first need to have models downloaded locally. We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models. -To download any model you need the **Model Descriptor**. -This can be obtained by running the command -``` -llama model list -``` - -You should see a table like this: - -``` -+----------------------------------+------------------------------------------+----------------+ -| Model Descriptor(ID) | Hugging Face Repo | Context Length | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-70B | meta-llama/Llama-3.1-70B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B:bf16-mp8 | meta-llama/Llama-3.1-405B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B | meta-llama/Llama-3.1-405B-FP8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B:bf16-mp16 | meta-llama/Llama-3.1-405B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-8B-Instruct | meta-llama/Llama-3.1-8B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-70B-Instruct | meta-llama/Llama-3.1-70B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct:bf16-mp8 | meta-llama/Llama-3.1-405B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct | meta-llama/Llama-3.1-405B-Instruct-FP8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-1B | meta-llama/Llama-3.2-1B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-3B | meta-llama/Llama-3.2-3B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-90B-Vision | meta-llama/Llama-3.2-90B-Vision | 
128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-1B-Instruct | meta-llama/Llama-3.2-1B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-3B-Instruct | meta-llama/Llama-3.2-3B-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-11B-Vision-Instruct | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama3.2-90B-Vision-Instruct | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-11B-Vision | meta-llama/Llama-Guard-3-11B-Vision | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-1B:int4-mp1 | meta-llama/Llama-Guard-3-1B-INT4 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-1B | meta-llama/Llama-Guard-3-1B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-8B | meta-llama/Llama-Guard-3-8B | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-3-8B:int8-mp1 | meta-llama/Llama-Guard-3-8B-INT8 | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Prompt-Guard-86M | meta-llama/Prompt-Guard-86M | 128K | -+----------------------------------+------------------------------------------+----------------+ -| Llama-Guard-2-8B | meta-llama/Llama-Guard-2-8B | 4K | -+----------------------------------+------------------------------------------+----------------+ -``` - -To download models, you can use the `llama download` command. - -### Downloading from [Meta](https://llama.meta.com/llama-downloads/) - -Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/) - -Download the required checkpoints using the following commands: +First, install the Hugging Face CLI: ```bash -# download the 8B model, this can be run on a single GPU -llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url META_URL - -# you can also get the 70B model, this will require 8 GPUs however -llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url META_URL - -# llama-agents have safety enabled by default. For this, you will need -# safety models -- Llama-Guard and Prompt-Guard -llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL -llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL +pip install huggingface_hub[cli] ``` -### Downloading from [Hugging Face](https://huggingface.co/meta-llama) - -Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`. 
- +Then authenticate and download models: ```bash -llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token +# Authenticate with Hugging Face +huggingface-cli login -llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token - -llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original* -llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original* -``` - -**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). - -```{tip} -Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. +# Download a model +huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct ``` ## List the downloaded models -To list the downloaded models with the following command: -``` -llama model list --downloaded -``` - -You should see a table like this: -``` -┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ -┃ Model ┃ Size ┃ Modified Time ┃ -┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ -│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ -└─────────────────────────────────────────┴──────────┴─────────────────────┘ -``` - - -## Understand the models -The `llama model` command helps you explore the model’s interface. - -1. `download`: Download the model from different sources. (meta, huggingface) -2. `list`: Lists all the models available for download with hardware requirements for deploying the models. -3. `prompt-format`: Show llama model message formats. -4. `describe`: Describes all the properties of the model. - -### Sample Usage - -`llama model ` - -``` -llama model --help -``` -``` -usage: llama model [-h] {download,list,prompt-format,describe,verify-download,remove} ... 
- -Work with llama models - -options: - -h, --help show this help message and exit - -model_subcommands: - {download,list,prompt-format,describe,verify-download,remove} -``` - -### Describe - -You can use the describe command to know more about a model: -``` -llama model describe -m Llama3.2-3B-Instruct -``` -``` -+-----------------------------+----------------------------------+ -| Model | Llama3.2-3B-Instruct | -+-----------------------------+----------------------------------+ -| Hugging Face ID | meta-llama/Llama-3.2-3B-Instruct | -+-----------------------------+----------------------------------+ -| Description | Llama 3.2 3b instruct model | -+-----------------------------+----------------------------------+ -| Context Length | 128K tokens | -+-----------------------------+----------------------------------+ -| Weights format | bf16 | -+-----------------------------+----------------------------------+ -| Model params.json | { | -| | "dim": 3072, | -| | "n_layers": 28, | -| | "n_heads": 24, | -| | "n_kv_heads": 8, | -| | "vocab_size": 128256, | -| | "ffn_dim_multiplier": 1.0, | -| | "multiple_of": 256, | -| | "norm_eps": 1e-05, | -| | "rope_theta": 500000.0, | -| | "use_scaled_rope": true | -| | } | -+-----------------------------+----------------------------------+ -| Recommended sampling params | { | -| | "temperature": 1.0, | -| | "top_p": 0.9, | -| | "top_k": 0 | -| | } | -+-----------------------------+----------------------------------+ -``` - -### Prompt Format -You can even run `llama model prompt-format` see all of the templates and their tokens: - -``` -llama model prompt-format -m Llama3.2-3B-Instruct -``` -![alt text](/img/prompt-format.png) - - -You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios. - -**NOTE**: Outputs in terminal are color printed to show special tokens. - -### Remove model -You can run `llama model remove` to remove an unnecessary model: - -``` -llama model remove -m Llama-Guard-3-8B-int8 +To list the downloaded models, you can use the Hugging Face CLI: +```bash +# List all downloaded models in your local cache +huggingface-cli scan-cache ``` diff --git a/docs/getting_started_llama4.ipynb b/docs/getting_started_llama4.ipynb index bca505b5e..0ec9aa0e6 100644 --- a/docs/getting_started_llama4.ipynb +++ b/docs/getting_started_llama4.ipynb @@ -51,11 +51,11 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install uv\n", + "!pip install uv \"huggingface_hub[cli]\"\n", "\n", "MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n", "# get meta url from llama.com\n", - "!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url \n", + "huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL\n", "\n", "model_id = f\"meta-llama/{MODEL}\"" ] diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py deleted file mode 100644 index 70cb9f4db..000000000 --- a/llama_stack/cli/download.py +++ /dev/null @@ -1,495 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import argparse -import asyncio -import json -import os -import shutil -import sys -from dataclasses import dataclass -from datetime import UTC, datetime -from functools import partial -from pathlib import Path - -import httpx -from pydantic import BaseModel, ConfigDict -from rich.console import Console -from rich.progress import ( - BarColumn, - DownloadColumn, - Progress, - TextColumn, - TimeRemainingColumn, - TransferSpeedColumn, -) -from termcolor import cprint - -from llama_stack.cli.subcommand import Subcommand -from llama_stack.models.llama.sku_list import LlamaDownloadInfo -from llama_stack.models.llama.sku_types import Model - - -class Download(Subcommand): - """Llama cli for downloading llama toolchain assets""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "download", - prog="llama download", - description="Download a model from llama.meta.com or Hugging Face Hub", - formatter_class=argparse.RawTextHelpFormatter, - ) - setup_download_parser(self.parser) - - -def setup_download_parser(parser: argparse.ArgumentParser) -> None: - parser.add_argument( - "--source", - choices=["meta", "huggingface"], - default="meta", - ) - parser.add_argument( - "--model-id", - required=False, - help="See `llama model list` or `llama model list --show-all` for the list of available models. Specify multiple model IDs with commas, e.g. --model-id Llama3.2-1B,Llama3.2-3B", - ) - parser.add_argument( - "--hf-token", - type=str, - required=False, - default=None, - help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.", - ) - parser.add_argument( - "--meta-url", - type=str, - required=False, - help="For source=meta, URL obtained from llama.meta.com after accepting license terms", - ) - parser.add_argument( - "--max-parallel", - type=int, - required=False, - default=3, - help="Maximum number of concurrent downloads", - ) - parser.add_argument( - "--ignore-patterns", - type=str, - required=False, - default="*.safetensors", - help="""For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring -safetensors files to avoid downloading duplicate weights. 
-""", - ) - parser.add_argument( - "--manifest-file", - type=str, - help="For source=meta, you can download models from a manifest file containing a file => URL mapping", - required=False, - ) - parser.set_defaults(func=partial(run_download_cmd, parser=parser)) - - -@dataclass -class DownloadTask: - url: str - output_file: str - total_size: int = 0 - downloaded_size: int = 0 - task_id: int | None = None - retries: int = 0 - max_retries: int = 3 - - -class DownloadError(Exception): - pass - - -class CustomTransferSpeedColumn(TransferSpeedColumn): - def render(self, task): - if task.finished: - return "-" - return super().render(task) - - -class ParallelDownloader: - def __init__( - self, - max_concurrent_downloads: int = 3, - buffer_size: int = 1024 * 1024, - timeout: int = 30, - ): - self.max_concurrent_downloads = max_concurrent_downloads - self.buffer_size = buffer_size - self.timeout = timeout - self.console = Console() - self.progress = Progress( - TextColumn("[bold blue]{task.description}"), - BarColumn(bar_width=40), - "[progress.percentage]{task.percentage:>3.1f}%", - DownloadColumn(), - CustomTransferSpeedColumn(), - TimeRemainingColumn(), - console=self.console, - expand=True, - ) - self.client_options = { - "timeout": httpx.Timeout(timeout), - "follow_redirects": True, - } - - async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs): - last_exception = None - for attempt in range(task.max_retries): - try: - return await func(*args, **kwargs) - except Exception as e: - last_exception = e - if attempt < task.max_retries - 1: - wait_time = min(30, 2**attempt) # Cap at 30 seconds - self.console.print( - f"[yellow]Attempt {attempt + 1}/{task.max_retries} failed, " - f"retrying in {wait_time} seconds: {str(e)}[/yellow]" - ) - await asyncio.sleep(wait_time) - continue - raise last_exception - - async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None: - if task.total_size > 0: - self.progress.update(task.task_id, total=task.total_size) - return - - async def _get_info(): - response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options) - response.raise_for_status() - return response - - try: - response = await self.retry_with_exponential_backoff(task, _get_info) - - task.url = str(response.url) - task.total_size = int(response.headers.get("Content-Length", 0)) - - if task.total_size == 0: - raise DownloadError( - f"Unable to determine file size for {task.output_file}. " - "The server might not support range requests." 
- ) - - # Update the progress bar's total size once we know it - if task.task_id is not None: - self.progress.update(task.task_id, total=task.total_size) - - except httpx.HTTPError as e: - self.console.print(f"[red]Error getting file info: {str(e)}[/red]") - raise - - def verify_file_integrity(self, task: DownloadTask) -> bool: - if not os.path.exists(task.output_file): - return False - return os.path.getsize(task.output_file) == task.total_size - - async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None: - async def _download_chunk(): - headers = {"Range": f"bytes={start}-{end}"} - async with client.stream("GET", task.url, headers=headers, **self.client_options) as response: - response.raise_for_status() - - with open(task.output_file, "ab") as file: - file.seek(start) - async for chunk in response.aiter_bytes(self.buffer_size): - file.write(chunk) - task.downloaded_size += len(chunk) - self.progress.update( - task.task_id, - completed=task.downloaded_size, - ) - - try: - await self.retry_with_exponential_backoff(task, _download_chunk) - except Exception as e: - raise DownloadError( - f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}" - ) from e - - async def prepare_download(self, task: DownloadTask) -> None: - output_dir = os.path.dirname(task.output_file) - os.makedirs(output_dir, exist_ok=True) - - if os.path.exists(task.output_file): - task.downloaded_size = os.path.getsize(task.output_file) - - async def download_file(self, task: DownloadTask) -> None: - try: - async with httpx.AsyncClient(**self.client_options) as client: - await self.get_file_info(client, task) - - # Check if file is already downloaded - if os.path.exists(task.output_file): - if self.verify_file_integrity(task): - self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.task_id, completed=task.total_size) - return - - await self.prepare_download(task) - - try: - # Split the remaining download into chunks - chunk_size = 27_000_000_000 # Cloudfront max chunk size - chunks = [] - - current_pos = task.downloaded_size - while current_pos < task.total_size: - chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1) - chunks.append((current_pos, chunk_end)) - current_pos = chunk_end + 1 - - # Download chunks in sequence - for chunk_start, chunk_end in chunks: - await self.download_chunk(client, task, chunk_start, chunk_end) - - except Exception as e: - raise DownloadError(f"Download failed: {str(e)}") from e - - except Exception as e: - self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") - raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e - - def has_disk_space(self, tasks: list[DownloadTask]) -> bool: - try: - total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks) - dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file)) - free_space = shutil.disk_usage(dir_path).free - - # Add 10% buffer for safety - required_space = int(total_remaining_size * 1.1) - - if free_space < required_space: - self.console.print( - f"[red]Not enough disk space. 
Required: {required_space // (1024 * 1024)} MB, " - f"Available: {free_space // (1024 * 1024)} MB[/red]" - ) - return False - return True - - except Exception as e: - raise DownloadError(f"Failed to check disk space: {str(e)}") from e - - async def download_all(self, tasks: list[DownloadTask]) -> None: - if not tasks: - raise ValueError("No download tasks provided") - - if not os.environ.get("LLAMA_DOWNLOAD_NO_SPACE_CHECK") and not self.has_disk_space(tasks): - raise DownloadError("Insufficient disk space for downloads") - - failed_tasks = [] - - with self.progress: - for task in tasks: - desc = f"Downloading {Path(task.output_file).name}" - task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) - - semaphore = asyncio.Semaphore(self.max_concurrent_downloads) - - async def download_with_semaphore(task: DownloadTask): - async with semaphore: - try: - await self.download_file(task) - except Exception as e: - failed_tasks.append((task, str(e))) - - await asyncio.gather(*(download_with_semaphore(task) for task in tasks)) - - if failed_tasks: - self.console.print("\n[red]Some downloads failed:[/red]") - for task, error in failed_tasks: - self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]") - raise DownloadError(f"{len(failed_tasks)} downloads failed") - - -def _hf_download( - model: "Model", - hf_token: str, - ignore_patterns: str, - parser: argparse.ArgumentParser, -): - from huggingface_hub import snapshot_download - from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError - - from llama_stack.core.utils.model_utils import model_local_dir - - repo_id = model.huggingface_repo - if repo_id is None: - raise ValueError(f"No repo id found for model {model.descriptor()}") - - output_dir = model_local_dir(model.descriptor()) - os.makedirs(output_dir, exist_ok=True) - try: - true_output_dir = snapshot_download( - repo_id, - local_dir=output_dir, - ignore_patterns=ignore_patterns, - token=hf_token, - library_name="llama-stack", - ) - except GatedRepoError: - parser.error( - "It looks like you are trying to access a gated repository. Please ensure you " - "have access to the repository and have provided the proper Hugging Face API token " - "using the option `--hf-token` or by running `huggingface-cli login`." 
- "You can find your token by visiting https://huggingface.co/settings/tokens" - ) - except RepositoryNotFoundError: - parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub or incorrect Hugging Face token.") - except Exception as e: - parser.error(e) - - print(f"\nSuccessfully downloaded model to {true_output_dir}") - - -def _meta_download( - model: "Model", - model_id: str, - meta_url: str, - info: "LlamaDownloadInfo", - max_concurrent_downloads: int, -): - from llama_stack.core.utils.model_utils import model_local_dir - - output_dir = Path(model_local_dir(model.descriptor())) - os.makedirs(output_dir, exist_ok=True) - - # Create download tasks for each file - tasks = [] - for f in info.files: - output_file = str(output_dir / f) - url = meta_url.replace("*", f"{info.folder}/{f}") - total_size = info.pth_size if "consolidated" in f else 0 - tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3)) - - # Initialize and run parallel downloader - downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads) - asyncio.run(downloader.download_all(tasks)) - - cprint(f"\nSuccessfully downloaded model to {output_dir}", color="green", file=sys.stderr) - cprint( - f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}", - file=sys.stderr, - ) - cprint( - f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}", - color="yellow", - file=sys.stderr, - ) - - -class ModelEntry(BaseModel): - model_id: str - files: dict[str, str] - - model_config = ConfigDict(protected_namespaces=()) - - -class Manifest(BaseModel): - models: list[ModelEntry] - expires_on: datetime - - -def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int): - from llama_stack.core.utils.model_utils import model_local_dir - - with open(manifest_file) as f: - d = json.load(f) - manifest = Manifest(**d) - - if datetime.now(UTC) > manifest.expires_on.astimezone(UTC): - raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}") - - console = Console() - for entry in manifest.models: - console.print(f"[blue]Downloading model {entry.model_id}...[/blue]") - output_dir = Path(model_local_dir(entry.model_id)) - os.makedirs(output_dir, exist_ok=True) - - if any(output_dir.iterdir()): - console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]") - - while True: - resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ") - if resp.lower() in ["restart", "r"]: - shutil.rmtree(output_dir) - os.makedirs(output_dir, exist_ok=True) - break - elif resp.lower() in ["continue", "c"]: - console.print("[blue]Continuing download...[/blue]") - break - else: - console.print("[red]Invalid response. 
Please try again.[/red]") - - # Create download tasks for all files in the manifest - tasks = [ - DownloadTask(url=url, output_file=str(output_dir / fname), max_retries=3) - for fname, url in entry.files.items() - ] - - # Initialize and run parallel downloader - downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads) - asyncio.run(downloader.download_all(tasks)) - - -def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser): - """Main download command handler""" - try: - if args.manifest_file: - _download_from_manifest(args.manifest_file, args.max_parallel) - return - - if args.model_id is None: - parser.error("Please provide a model id") - return - - # Handle comma-separated model IDs - model_ids = [model_id.strip() for model_id in args.model_id.split(",")] - - from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model - - from .model.safety_models import ( - prompt_guard_download_info_map, - prompt_guard_model_sku_map, - ) - - prompt_guard_model_sku_map = prompt_guard_model_sku_map() - prompt_guard_download_info_map = prompt_guard_download_info_map() - - for model_id in model_ids: - if model_id in prompt_guard_model_sku_map.keys(): - model = prompt_guard_model_sku_map[model_id] - info = prompt_guard_download_info_map[model_id] - else: - model = resolve_model(model_id) - if model is None: - parser.error(f"Model {model_id} not found") - continue - info = llama_meta_net_info(model) - - if args.source == "huggingface": - _hf_download(model, args.hf_token, args.ignore_patterns, parser) - else: - meta_url = args.meta_url or input( - f"Please provide the signed URL for model {model_id} you received via email " - f"after visiting https://www.llama.com/llama-downloads/ " - f"(e.g., https://llama3-1.llamameta.net/*?Policy...): " - ) - if "llamameta.net" not in meta_url: - parser.error("Invalid Meta URL provided") - _meta_download(model, model_id, meta_url, info, args.max_parallel) - - except Exception as e: - parser.error(f"Download failed: {str(e)}") diff --git a/llama_stack/cli/llama.py b/llama_stack/cli/llama.py index 433b311e7..5ff15d8d7 100644 --- a/llama_stack/cli/llama.py +++ b/llama_stack/cli/llama.py @@ -6,11 +6,8 @@ import argparse -from .download import Download -from .model import ModelParser from .stack import StackParser from .stack.utils import print_subcommand_description -from .verify_download import VerifyDownload class LlamaCLIParser: @@ -30,10 +27,7 @@ class LlamaCLIParser: subparsers = self.parser.add_subparsers(title="subcommands") # Add sub-commands - ModelParser.create(subparsers) StackParser.create(subparsers) - Download.create(subparsers) - VerifyDownload.create(subparsers) print_subcommand_description(self.parser, subparsers) diff --git a/llama_stack/cli/model/__init__.py b/llama_stack/cli/model/__init__.py deleted file mode 100644 index db70364a9..000000000 --- a/llama_stack/cli/model/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .model import ModelParser # noqa diff --git a/llama_stack/cli/model/describe.py b/llama_stack/cli/model/describe.py deleted file mode 100644 index 26b0da686..000000000 --- a/llama_stack/cli/model/describe.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import argparse -import json - -from llama_stack.cli.subcommand import Subcommand -from llama_stack.cli.table import print_table -from llama_stack.models.llama.sku_list import resolve_model - - -class ModelDescribe(Subcommand): - """Show details about a model""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "describe", - prog="llama model describe", - description="Show details about a llama model", - formatter_class=argparse.RawTextHelpFormatter, - ) - self._add_arguments() - self.parser.set_defaults(func=self._run_model_describe_cmd) - - def _add_arguments(self): - self.parser.add_argument( - "-m", - "--model-id", - type=str, - required=True, - help="See `llama model list` or `llama model list --show-all` for the list of available models", - ) - - def _run_model_describe_cmd(self, args: argparse.Namespace) -> None: - from .safety_models import prompt_guard_model_sku_map - - prompt_guard_model_map = prompt_guard_model_sku_map() - if args.model_id in prompt_guard_model_map.keys(): - model = prompt_guard_model_map[args.model_id] - else: - model = resolve_model(args.model_id) - - if model is None: - self.parser.error( - f"Model {args.model_id} not found; try 'llama model list' for a list of available models." - ) - return - - headers = [ - "Model", - model.descriptor(), - ] - - rows = [ - ("Hugging Face ID", model.huggingface_repo or ""), - ("Description", model.description), - ("Context Length", f"{model.max_seq_length // 1024}K tokens"), - ("Weights format", model.quantization_format.value), - ("Model params.json", json.dumps(model.arch_args, indent=4)), - ] - - print_table( - rows, - headers, - separate_rows=True, - ) diff --git a/llama_stack/cli/model/download.py b/llama_stack/cli/model/download.py deleted file mode 100644 index a3b8f7796..000000000 --- a/llama_stack/cli/model/download.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import argparse - -from llama_stack.cli.subcommand import Subcommand - - -class ModelDownload(Subcommand): - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "download", - prog="llama model download", - description="Download a model from llama.meta.com or Hugging Face Hub", - formatter_class=argparse.RawTextHelpFormatter, - ) - - from llama_stack.cli.download import setup_download_parser - - setup_download_parser(self.parser) diff --git a/llama_stack/cli/model/list.py b/llama_stack/cli/model/list.py deleted file mode 100644 index f46a8c88d..000000000 --- a/llama_stack/cli/model/list.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import argparse -import os -import time -from pathlib import Path - -from llama_stack.cli.subcommand import Subcommand -from llama_stack.cli.table import print_table -from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR -from llama_stack.models.llama.sku_list import all_registered_models - - -def _get_model_size(model_dir): - return sum(f.stat().st_size for f in Path(model_dir).rglob("*") if f.is_file()) - - -def _convert_to_model_descriptor(model): - for m in all_registered_models(): - if model == m.descriptor().replace(":", "-"): - return str(m.descriptor()) - return str(model) - - -def _run_model_list_downloaded_cmd() -> None: - headers = ["Model", "Size", "Modified Time"] - - rows = [] - for model in os.listdir(DEFAULT_CHECKPOINT_DIR): - abs_path = os.path.join(DEFAULT_CHECKPOINT_DIR, model) - space_usage = _get_model_size(abs_path) - model_size = f"{space_usage / (1024**3):.2f} GB" - modified_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getmtime(abs_path))) - rows.append( - [ - _convert_to_model_descriptor(model), - model_size, - modified_time, - ] - ) - - print_table( - rows, - headers, - separate_rows=True, - ) - - -class ModelList(Subcommand): - """List available llama models""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "list", - prog="llama model list", - description="Show available llama models", - formatter_class=argparse.RawTextHelpFormatter, - ) - self._add_arguments() - self.parser.set_defaults(func=self._run_model_list_cmd) - - def _add_arguments(self): - self.parser.add_argument( - "--show-all", - action="store_true", - help="Show all models (not just defaults)", - ) - self.parser.add_argument( - "--downloaded", - action="store_true", - help="List the downloaded models", - ) - self.parser.add_argument( - "-s", - "--search", - type=str, - required=False, - help="Search for the input string as a substring in the model descriptor(ID)", - ) - - def _run_model_list_cmd(self, args: argparse.Namespace) -> None: - from .safety_models import prompt_guard_model_skus - - if args.downloaded: - return _run_model_list_downloaded_cmd() - - headers = [ - "Model Descriptor(ID)", - "Hugging Face Repo", - "Context Length", - ] - - rows = [] - for model in all_registered_models() + prompt_guard_model_skus(): - if not args.show_all and not model.is_featured: - continue - - descriptor = model.descriptor() - if not args.search or args.search.lower() in descriptor.lower(): - rows.append( - [ - descriptor, - model.huggingface_repo, - f"{model.max_seq_length // 1024}K", - ] - ) - if len(rows) == 0: - print(f"Did not find any model matching `{args.search}`.") - else: - print_table( - rows, - headers, - separate_rows=True, - ) diff --git a/llama_stack/cli/model/model.py b/llama_stack/cli/model/model.py deleted file mode 100644 index 808029945..000000000 --- a/llama_stack/cli/model/model.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import argparse - -from llama_stack.cli.model.describe import ModelDescribe -from llama_stack.cli.model.download import ModelDownload -from llama_stack.cli.model.list import ModelList -from llama_stack.cli.model.prompt_format import ModelPromptFormat -from llama_stack.cli.model.remove import ModelRemove -from llama_stack.cli.model.verify_download import ModelVerifyDownload -from llama_stack.cli.stack.utils import print_subcommand_description -from llama_stack.cli.subcommand import Subcommand - - -class ModelParser(Subcommand): - """Llama cli for model interface apis""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "model", - prog="llama model", - description="Work with llama models", - formatter_class=argparse.RawTextHelpFormatter, - ) - - self.parser.set_defaults(func=lambda args: self.parser.print_help()) - - subparsers = self.parser.add_subparsers(title="model_subcommands") - - # Add sub-commands - ModelDownload.create(subparsers) - ModelList.create(subparsers) - ModelPromptFormat.create(subparsers) - ModelDescribe.create(subparsers) - ModelVerifyDownload.create(subparsers) - ModelRemove.create(subparsers) - - print_subcommand_description(self.parser, subparsers) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py deleted file mode 100644 index 673487812..000000000 --- a/llama_stack/cli/model/prompt_format.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import argparse -import textwrap -from io import StringIO -from pathlib import Path - -from llama_stack.cli.subcommand import Subcommand -from llama_stack.cli.table import print_table -from llama_stack.models.llama.sku_types import CoreModelId, ModelFamily, is_multimodal, model_family - -ROOT_DIR = Path(__file__).parent.parent.parent - - -class ModelPromptFormat(Subcommand): - """Llama model cli for describe a model prompt format (message formats)""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "prompt-format", - prog="llama model prompt-format", - description="Show llama model message formats", - epilog=textwrap.dedent( - """ - Example: - llama model prompt-format - """ - ), - formatter_class=argparse.RawTextHelpFormatter, - ) - self._add_arguments() - self.parser.set_defaults(func=self._run_model_template_cmd) - - def _add_arguments(self): - self.parser.add_argument( - "-m", - "--model-name", - type=str, - help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n" - "(Run `llama model list` to see a list of valid model names)", - ) - self.parser.add_argument( - "-l", - "--list", - action="store_true", - help="List all available models", - ) - - def _run_model_template_cmd(self, args: argparse.Namespace) -> None: - import importlib.resources - - # Only Llama 3.1 and 3.2 are supported - supported_model_ids = [ - m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2} - ] - - model_list = [m.value for m in supported_model_ids] - - if args.list: - headers = ["Model(s)"] - rows = [] - for m in model_list: - rows.append( - [ - m, - ] - ) - print_table( - rows, - headers, - separate_rows=True, - ) - return - - try: - model_id = CoreModelId(args.model_name) - except ValueError: - self.parser.error( - 
f"{args.model_name} is not a valid Model. Choose one from the list of valid models. " - f"Run `llama model list` to see the valid model names." - ) - - if model_id not in supported_model_ids: - self.parser.error( - f"{model_id} is not a valid Model. Choose one from the list of valid models. " - f"Run `llama model list` to see the valid model names." - ) - - llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md" - llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md" - llama_3_2_vision_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "vision_prompt_format.md" - if model_family(model_id) == ModelFamily.llama3_1: - with importlib.resources.as_file(llama_3_1_file) as f: - content = f.open("r").read() - elif model_family(model_id) == ModelFamily.llama3_2: - if is_multimodal(model_id): - with importlib.resources.as_file(llama_3_2_vision_file) as f: - content = f.open("r").read() - else: - with importlib.resources.as_file(llama_3_2_text_file) as f: - content = f.open("r").read() - - render_markdown_to_pager(content) - - -def render_markdown_to_pager(markdown_content: str): - from rich.console import Console - from rich.markdown import Markdown - from rich.style import Style - from rich.text import Text - - class LeftAlignedHeaderMarkdown(Markdown): - def parse_header(self, token): - level = token.type.count("h") - content = Text(token.content) - header_style = Style(color="bright_blue", bold=True) - header = Text(f"{'#' * level} ", style=header_style) + content - self.add_text(header) - - # Render the Markdown - md = LeftAlignedHeaderMarkdown(markdown_content) - - # Capture the rendered output - output = StringIO() - console = Console(file=output, force_terminal=True, width=100) # Set a fixed width - console.print(md) - rendered_content = output.getvalue() - print(rendered_content) diff --git a/llama_stack/cli/model/remove.py b/llama_stack/cli/model/remove.py deleted file mode 100644 index 138e06a2a..000000000 --- a/llama_stack/cli/model/remove.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import argparse -import os -import shutil - -from llama_stack.cli.subcommand import Subcommand -from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR -from llama_stack.models.llama.sku_list import resolve_model - - -class ModelRemove(Subcommand): - """Remove the downloaded llama model""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "remove", - prog="llama model remove", - description="Remove the downloaded llama model", - formatter_class=argparse.RawTextHelpFormatter, - ) - self._add_arguments() - self.parser.set_defaults(func=self._run_model_remove_cmd) - - def _add_arguments(self): - self.parser.add_argument( - "-m", - "--model", - required=True, - help="Specify the llama downloaded model name, see `llama model list --downloaded`", - ) - self.parser.add_argument( - "-f", - "--force", - action="store_true", - help="Used to forcefully remove the llama model from the storage without further confirmation", - ) - - def _run_model_remove_cmd(self, args: argparse.Namespace) -> None: - from .safety_models import prompt_guard_model_sku_map - - prompt_guard_model_map = prompt_guard_model_sku_map() - - if args.model in prompt_guard_model_map.keys(): - model = prompt_guard_model_map[args.model] - else: - model = resolve_model(args.model) - - model_path = os.path.join(DEFAULT_CHECKPOINT_DIR, args.model.replace(":", "-")) - - if model is None or not os.path.isdir(model_path): - print(f"'{args.model}' is not a valid llama model or does not exist.") - return - - if args.force: - shutil.rmtree(model_path) - print(f"{args.model} removed.") - else: - if input(f"Are you sure you want to remove {args.model}? (y/n): ").strip().lower() == "y": - shutil.rmtree(model_path) - print(f"{args.model} removed.") - else: - print("Removal aborted.") diff --git a/llama_stack/cli/model/safety_models.py b/llama_stack/cli/model/safety_models.py deleted file mode 100644 index e31767f13..000000000 --- a/llama_stack/cli/model/safety_models.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from pydantic import BaseModel, ConfigDict, Field - -from llama_stack.models.llama.sku_list import LlamaDownloadInfo -from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat - - -class PromptGuardModel(BaseModel): - """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed.""" - - model_id: str - huggingface_repo: str - description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon." 
- is_featured: bool = False - max_seq_length: int = 512 - is_instruct_model: bool = False - quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16 - arch_args: dict[str, Any] = Field(default_factory=dict) - - def descriptor(self) -> str: - return self.model_id - - model_config = ConfigDict(protected_namespaces=()) - - -def prompt_guard_model_skus(): - return [ - PromptGuardModel(model_id="Prompt-Guard-86M", huggingface_repo="meta-llama/Prompt-Guard-86M"), - PromptGuardModel( - model_id="Llama-Prompt-Guard-2-86M", - huggingface_repo="meta-llama/Llama-Prompt-Guard-2-86M", - ), - PromptGuardModel( - model_id="Llama-Prompt-Guard-2-22M", - huggingface_repo="meta-llama/Llama-Prompt-Guard-2-22M", - ), - ] - - -def prompt_guard_model_sku_map() -> dict[str, Any]: - return {model.model_id: model for model in prompt_guard_model_skus()} - - -def prompt_guard_download_info_map() -> dict[str, LlamaDownloadInfo]: - return { - model.model_id: LlamaDownloadInfo( - folder="Prompt-Guard" if model.model_id == "Prompt-Guard-86M" else model.model_id, - files=[ - "model.safetensors", - "special_tokens_map.json", - "tokenizer.json", - "tokenizer_config.json", - ], - pth_size=1, - ) - for model in prompt_guard_model_skus() - } diff --git a/llama_stack/cli/model/verify_download.py b/llama_stack/cli/model/verify_download.py deleted file mode 100644 index e7159c0aa..000000000 --- a/llama_stack/cli/model/verify_download.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import argparse - -from llama_stack.cli.subcommand import Subcommand - - -class ModelVerifyDownload(Subcommand): - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "verify-download", - prog="llama model verify-download", - description="Verify the downloaded checkpoints' checksums for models downloaded from Meta", - formatter_class=argparse.RawTextHelpFormatter, - ) - - from llama_stack.cli.verify_download import setup_verify_download_parser - - setup_verify_download_parser(self.parser) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py deleted file mode 100644 index e738abb4f..000000000 --- a/llama_stack/cli/verify_download.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import argparse -import hashlib -from dataclasses import dataclass -from functools import partial -from pathlib import Path - -from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TextColumn - -from llama_stack.cli.subcommand import Subcommand - - -@dataclass -class VerificationResult: - filename: str - expected_hash: str - actual_hash: str | None - exists: bool - matches: bool - - -class VerifyDownload(Subcommand): - """Llama cli for verifying downloaded model files""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "verify-download", - prog="llama verify-download", - description="Verify integrity of downloaded model files", - formatter_class=argparse.RawTextHelpFormatter, - ) - setup_verify_download_parser(self.parser) - - -def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None: - parser.add_argument( - "--model-id", - required=True, - help="Model ID to verify (only for models downloaded from Meta)", - ) - parser.set_defaults(func=partial(run_verify_cmd, parser=parser)) - - -def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str: - sha256_hash = hashlib.sha256() - with open(filepath, "rb") as f: - for chunk in iter(lambda: f.read(chunk_size), b""): - sha256_hash.update(chunk) - return sha256_hash.hexdigest() - - -def load_checksums(checklist_path: Path) -> dict[str, str]: - checksums = {} - with open(checklist_path) as f: - for line in f: - if line.strip(): - sha256sum, filepath = line.strip().split(" ", 1) - # Remove leading './' if present - filepath = filepath.lstrip("./") - checksums[filepath] = sha256sum - return checksums - - -def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -> list[VerificationResult]: - results = [] - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console, - ) as progress: - for filepath, expected_hash in checksums.items(): - full_path = model_dir / filepath - task_id = progress.add_task(f"Verifying {filepath}...", total=None) - - exists = full_path.exists() - actual_hash = None - matches = False - - if exists: - actual_hash = calculate_sha256(full_path) - matches = actual_hash == expected_hash - - results.append( - VerificationResult( - filename=filepath, - expected_hash=expected_hash, - actual_hash=actual_hash, - exists=exists, - matches=matches, - ) - ) - - progress.remove_task(task_id) - - return results - - -def run_verify_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser): - from llama_stack.core.utils.model_utils import model_local_dir - - console = Console() - model_dir = Path(model_local_dir(args.model_id)) - checklist_path = model_dir / "checklist.chk" - - if not model_dir.exists(): - parser.error(f"Model directory not found: {model_dir}") - - if not checklist_path.exists(): - parser.error(f"Checklist file not found: {checklist_path}") - - checksums = load_checksums(checklist_path) - results = verify_files(model_dir, checksums, console) - - # Print results - console.print("\nVerification Results:") - - all_good = True - for result in results: - if not result.exists: - console.print(f"[red]❌ {result.filename}: File not found[/red]") - all_good = False - elif not result.matches: - console.print( - f"[red]❌ {result.filename}: Hash mismatch[/red]\n" - f" Expected: {result.expected_hash}\n" - f" Got: {result.actual_hash}" - ) - all_good = False - else: - console.print(f"[green]✓ {result.filename}: Verified[/green]") - 
-
-    if all_good:
-        console.print("\n[green]All files verified successfully![/green]")
diff --git a/llama_stack/distributions/meta-reference-gpu/doc_template.md b/llama_stack/distributions/meta-reference-gpu/doc_template.md
index 92dcc6102..a7e8c2d67 100644
--- a/llama_stack/distributions/meta-reference-gpu/doc_template.md
+++ b/llama_stack/distributions/meta-reference-gpu/doc_template.md
@@ -29,31 +29,12 @@ The following environment variables can be configured:

 ## Prerequisite: Downloading Models

-Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-
-```
-$ llama model list --downloaded
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-├─────────────────────────────────────────┼──────────┼─────────────────────┤
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) for instructions on downloading models with the Hugging Face CLI.
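+
+For example, a checkpoint can be fetched into `~/.llama` with the Hugging Face CLI (the model ID below is only an illustration; substitute the model you plan to serve):
+
+```
+huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct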
``` ## Running the Distribution diff --git a/pyproject.toml b/pyproject.toml index df441e317..81997c249 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,14 +25,13 @@ classifiers = [ ] dependencies = [ "aiohttp", - "fastapi>=0.115.0,<1.0", # server - "fire", # for MCP in LLS client + "fastapi>=0.115.0,<1.0", # server + "fire", # for MCP in LLS client "httpx", - "huggingface-hub>=0.34.0,<1.0", "jinja2>=3.1.6", "jsonschema", "llama-stack-client>=0.2.23", - "openai>=1.107", # for expires_after support + "openai>=1.107", # for expires_after support "prompt-toolkit", "python-dotenv", "python-jose[cryptography]", @@ -43,13 +42,13 @@ dependencies = [ "tiktoken", "pillow", "h11>=0.16.0", - "python-multipart>=0.0.20", # For fastapi Form - "uvicorn>=0.34.0", # server - "opentelemetry-sdk>=1.30.0", # server + "python-multipart>=0.0.20", # For fastapi Form + "uvicorn>=0.34.0", # server + "opentelemetry-sdk>=1.30.0", # server "opentelemetry-exporter-otlp-proto-http>=1.30.0", # server - "aiosqlite>=0.21.0", # server - for metadata store - "asyncpg", # for metadata store - "sqlalchemy[asyncio]>=2.0.41", # server - for conversations + "aiosqlite>=0.21.0", # server - for metadata store + "asyncpg", # for metadata store + "sqlalchemy[asyncio]>=2.0.41", # server - for conversations ] [project.optional-dependencies] @@ -68,14 +67,14 @@ dev = [ "pytest-cov", "pytest-html", "pytest-json-report", - "pytest-socket", # For blocking network access in unit tests - "nbval", # For notebook testing + "pytest-socket", # For blocking network access in unit tests + "nbval", # For notebook testing "black", "ruff", "types-requests", "types-setuptools", "pre-commit", - "ruamel.yaml", # needed for openapi generator + "ruamel.yaml", # needed for openapi generator ] # These are the dependencies required for running unit tests. unit = [ @@ -141,9 +140,7 @@ docs = [ "requests", ] codegen = ["rich", "pydantic>=2.11.9", "jinja2>=3.1.6"] -benchmark = [ - "locust>=2.39.1", -] +benchmark = ["locust>=2.39.1"] [project.urls] Homepage = "https://github.com/llamastack/llama-stack" @@ -242,7 +239,6 @@ follow_imports = "silent" # to exclude the entire directory. 
exclude = [ # As we fix more and more of these, we should remove them from the list - "^llama_stack/cli/download\\.py$", "^llama_stack.core/build\\.py$", "^llama_stack.core/client\\.py$", "^llama_stack.core/request_headers\\.py$", @@ -332,6 +328,4 @@ classmethod-decorators = ["classmethod", "pydantic.field_validator"] [tool.pytest.ini_options] addopts = ["--durations=10"] asyncio_mode = "auto" -markers = [ - "allow_network: Allow network access for specific unit tests", -] +markers = ["allow_network: Allow network access for specific unit tests"] diff --git a/uv.lock b/uv.lock index 90b2832d8..11f396799 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.12" resolution-markers = [ "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -1774,7 +1774,6 @@ dependencies = [ { name = "fire" }, { name = "h11" }, { name = "httpx" }, - { name = "huggingface-hub" }, { name = "jinja2" }, { name = "jsonschema" }, { name = "llama-stack-client" }, @@ -1896,7 +1895,6 @@ requires-dist = [ { name = "fire" }, { name = "h11", specifier = ">=0.16.0" }, { name = "httpx" }, - { name = "huggingface-hub", specifier = ">=0.34.0,<1.0" }, { name = "jinja2", specifier = ">=3.1.6" }, { name = "jsonschema" }, { name = "llama-stack-client", specifier = ">=0.2.23" },