Mirror of https://github.com/meta-llama/llama-stack.git · synced 2025-10-11 13:44:38 +00:00
chore!: remove model mgmt from CLI for Hugging Face CLI (#3700)
This change removes the `llama model` and `llama download` subcommands from the CLI, replacing them with recommendations to use the Hugging Face CLI instead.

Rationale for this change:

- The model management functionality was largely duplicating what the Hugging Face CLI already provides, leading to unnecessary maintenance overhead (except the download source from Meta?)
- Maintaining our own implementation required fixing bugs and keeping up with changes in model repositories and download mechanisms
- The Hugging Face CLI is more mature, widely adopted, and better maintained
- This allows us to focus on core Llama Stack functionality rather than reimplementing model management tools

Changes made:

- Removed all model-related CLI commands and their implementations
- Updated documentation to recommend using `huggingface-cli` for model downloads
- Removed Meta-specific download logic and statements
- Simplified the CLI to focus solely on stack management operations

Users should now use:

- `huggingface-cli download` for downloading models
- `huggingface-cli scan-cache` for listing downloaded models

This is a breaking change, as it removes previously available CLI commands.

Signed-off-by: Sébastien Han <seb@redhat.com>
parent 841d0c3583 · commit 7ee0ee7843
21 changed files with 63 additions and 1612 deletions
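The replacement workflow relies entirely on the Hugging Face CLI. The snippet below is a minimal sketch of the commands users are now pointed to; the repository ID and target directory are illustrative, not prescribed by this commit:

```bash
# Install the Hugging Face CLI
pip install "huggingface_hub[cli]"

# Authenticate once (needed for gated meta-llama repositories)
huggingface-cli login

# Download a model checkpoint (example repo and destination)
huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct

# List models already present in the local Hugging Face cache
huggingface-cli scan-cache
```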
.github/workflows/python-build-test.yml (vendored, 2 changes)

@ -43,7 +43,5 @@ jobs:
 uv pip list
 uv pip show llama-stack
 command -v llama
-llama model prompt-format -m Llama3.2-90B-Vision-Instruct
-llama model list
 llama stack list-apis
 llama stack list-providers inference
@ -25,7 +25,7 @@ pip install -U llama_stack

 MODEL="Llama-4-Scout-17B-16E-Instruct"
 # get meta url from llama.com
-llama model download --source meta --model-id $MODEL --meta-url <META_URL>
+huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL

 # start a llama stack server
 INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
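The meta-llama repositories on Hugging Face are gated, so the `huggingface-cli download` step above typically requires accepting the model license on the Hub and authenticating locally first. A minimal sketch, assuming the license has already been accepted in the browser:

```bash
# One-time authentication; the token comes from https://huggingface.co/settings/tokens
huggingface-cli login

MODEL="Llama-4-Scout-17B-16E-Instruct"
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
```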
@ -41,31 +41,7 @@ The following environment variables can be configured:

 ## Prerequisite: Downloading Models

-Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.

-```
-$ llama model list --downloaded
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
-```

 ## Running the Distribution
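With `llama model list --downloaded` gone, the equivalent check goes through the Hugging Face tooling. A minimal sketch, assuming models were placed under `~/.llama` with `--local-dir` as in the updated docs:

```bash
# Models downloaded through the Hugging Face cache
huggingface-cli scan-cache

# Models downloaded to an explicit local directory
ls -lh ~/.llama
```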
@ -25,141 +25,42 @@ You have two ways to install Llama Stack:

 cd llama-stack
 pip install -e .

-## Downloading models via CLI
+## Downloading models via Hugging Face CLI

-You first need to have models downloaded locally.
+You first need to have models downloaded locally. We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models.

-To download any model you need the **Model Descriptor**.
-This can be obtained by running the command
+### Install Hugging Face CLI
-```
-llama model list
-```
-
-You should see a table like this:
+First, install the Hugging Face CLI:

-```
-| Model Descriptor(ID)             | Hugging Face Repo                        | Context Length |
-| Llama3.1-8B                      | meta-llama/Llama-3.1-8B                  | 128K           |
-| Llama3.1-70B                     | meta-llama/Llama-3.1-70B                 | 128K           |
-| Llama3.1-405B:bf16-mp8           | meta-llama/Llama-3.1-405B                | 128K           |
-| Llama3.1-405B                    | meta-llama/Llama-3.1-405B-FP8            | 128K           |
-| Llama3.1-405B:bf16-mp16          | meta-llama/Llama-3.1-405B                | 128K           |
-| Llama3.1-8B-Instruct             | meta-llama/Llama-3.1-8B-Instruct         | 128K           |
-| Llama3.1-70B-Instruct            | meta-llama/Llama-3.1-70B-Instruct        | 128K           |
-| Llama3.1-405B-Instruct:bf16-mp8  | meta-llama/Llama-3.1-405B-Instruct       | 128K           |
-| Llama3.1-405B-Instruct           | meta-llama/Llama-3.1-405B-Instruct-FP8   | 128K           |
-| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct       | 128K           |
-| Llama3.2-1B                      | meta-llama/Llama-3.2-1B                  | 128K           |
-| Llama3.2-3B                      | meta-llama/Llama-3.2-3B                  | 128K           |
-| Llama3.2-11B-Vision              | meta-llama/Llama-3.2-11B-Vision          | 128K           |
-| Llama3.2-90B-Vision              | meta-llama/Llama-3.2-90B-Vision          | 128K           |
-| Llama3.2-1B-Instruct             | meta-llama/Llama-3.2-1B-Instruct         | 128K           |
-| Llama3.2-3B-Instruct             | meta-llama/Llama-3.2-3B-Instruct         | 128K           |
-| Llama3.2-11B-Vision-Instruct     | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K           |
-| Llama3.2-90B-Vision-Instruct     | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K           |
-| Llama-Guard-3-11B-Vision         | meta-llama/Llama-Guard-3-11B-Vision      | 128K           |
-| Llama-Guard-3-1B:int4-mp1        | meta-llama/Llama-Guard-3-1B-INT4         | 128K           |
-| Llama-Guard-3-1B                 | meta-llama/Llama-Guard-3-1B              | 128K           |
-| Llama-Guard-3-8B                 | meta-llama/Llama-Guard-3-8B              | 128K           |
-| Llama-Guard-3-8B:int8-mp1        | meta-llama/Llama-Guard-3-8B-INT8         | 128K           |
-| Prompt-Guard-86M                 | meta-llama/Prompt-Guard-86M              | 128K           |
-| Llama-Guard-2-8B                 | meta-llama/Llama-Guard-2-8B              | 4K             |
-```
-
-To download models, you can use the llama download command.
-
-#### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
-
-Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/). Note: You need to quote the META_URL
-
-Download the required checkpoints using the following commands:

 ```bash
-# download the 8B model, this can be run on a single GPU
-llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url 'META_URL'
-
-# you can also get the 70B model, this will require 8 GPUs however
-llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url 'META_URL'
-
-# llama-agents have safety enabled by default. For this, you will need
-# safety models -- Llama-Guard and Prompt-Guard
-llama download --source meta --model-id Prompt-Guard-86M --meta-url 'META_URL'
-llama download --source meta --model-id Llama-Guard-3-1B --meta-url 'META_URL'
+pip install huggingface_hub[cli]
 ```

-#### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
+### Download models from Hugging Face

-Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.
+You can download models using the `huggingface-cli download` command. Here are some examples:

 ```bash
-llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token <HF_TOKEN>
-
-llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token <HF_TOKEN>
-
-llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original*
-llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original*
-```
-
-**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
-
-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+# Download Llama 3.2 3B Instruct model
+huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct
+
+# Download Llama 3.2 1B Instruct model
+huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --local-dir ~/.llama/Llama-3.2-1B-Instruct
+
+# Download Llama Guard 3 1B model
+huggingface-cli download meta-llama/Llama-Guard-3-1B --local-dir ~/.llama/Llama-Guard-3-1B
+
+# Download Prompt Guard model
+huggingface-cli download meta-llama/Prompt-Guard-86M --local-dir ~/.llama/Prompt-Guard-86M
 ```

+**Important:** You need to authenticate with Hugging Face to download models. You can do this by:
+1. Getting your token from [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+2. Running `huggingface-cli login` and entering your token
+
 ## List the downloaded models

-To list the downloaded models with the following command:
-```
-llama model list --downloaded
-```
-
-You should see a table like this:
-
-```
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
-```
+To list the downloaded models, you can use the Hugging Face CLI:
+```bash
+# List all downloaded models in your local cache
+huggingface-cli scan-cache
+```

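In non-interactive environments such as CI, authentication does not have to go through `huggingface-cli login`; the token can also be supplied via the `HF_TOKEN` environment variable, which the Hugging Face tooling picks up automatically. A minimal sketch (the token value is a placeholder):

```bash
export HF_TOKEN=hf_xxxxxxxxxxxxxxxxx
huggingface-cli download meta-llama/Llama-Guard-3-1B --local-dir ~/.llama/Llama-Guard-3-1B
```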
@ -27,9 +27,9 @@ You have two ways to install Llama Stack:

 ## `llama` subcommands
-1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models)
-2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models)
-3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation.
+1. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation.
+
+For downloading models, we recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli). See [Downloading models](#downloading-models) for more information.

 ### Sample Usage

@ -38,239 +38,41 @@ llama --help
 ```

 ```
-usage: llama [-h] {download,model,stack} ...
+usage: llama [-h] {stack} ...

 Welcome to the Llama CLI

 options:
   -h, --help  show this help message and exit

 subcommands:
-  {download,model,stack}
+  {stack}

+  stack  Operations for the Llama Stack / Distributions
 ```

 ## Downloading models

-You first need to have models downloaded locally.
+You first need to have models downloaded locally. We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models.

-To download any model you need the **Model Descriptor**.
-This can be obtained by running the command
-```
-llama model list
-```
-
-You should see a table like this:
-
-```
-| Model Descriptor(ID)             | Hugging Face Repo                        | Context Length |
-| Llama3.1-8B                      | meta-llama/Llama-3.1-8B                  | 128K           |
-| Llama3.1-70B                     | meta-llama/Llama-3.1-70B                 | 128K           |
-| Llama3.1-405B:bf16-mp8           | meta-llama/Llama-3.1-405B                | 128K           |
-| Llama3.1-405B                    | meta-llama/Llama-3.1-405B-FP8            | 128K           |
-| Llama3.1-405B:bf16-mp16          | meta-llama/Llama-3.1-405B                | 128K           |
-| Llama3.1-8B-Instruct             | meta-llama/Llama-3.1-8B-Instruct         | 128K           |
-| Llama3.1-70B-Instruct            | meta-llama/Llama-3.1-70B-Instruct        | 128K           |
-| Llama3.1-405B-Instruct:bf16-mp8  | meta-llama/Llama-3.1-405B-Instruct       | 128K           |
-| Llama3.1-405B-Instruct           | meta-llama/Llama-3.1-405B-Instruct-FP8   | 128K           |
-| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct       | 128K           |
-| Llama3.2-1B                      | meta-llama/Llama-3.2-1B                  | 128K           |
-| Llama3.2-3B                      | meta-llama/Llama-3.2-3B                  | 128K           |
-| Llama3.2-11B-Vision              | meta-llama/Llama-3.2-11B-Vision          | 128K           |
-| Llama3.2-90B-Vision              | meta-llama/Llama-3.2-90B-Vision          | 128K           |
-| Llama3.2-1B-Instruct             | meta-llama/Llama-3.2-1B-Instruct         | 128K           |
-| Llama3.2-3B-Instruct             | meta-llama/Llama-3.2-3B-Instruct         | 128K           |
-| Llama3.2-11B-Vision-Instruct     | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K           |
-| Llama3.2-90B-Vision-Instruct     | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K           |
-| Llama-Guard-3-11B-Vision         | meta-llama/Llama-Guard-3-11B-Vision      | 128K           |
-| Llama-Guard-3-1B:int4-mp1        | meta-llama/Llama-Guard-3-1B-INT4         | 128K           |
-| Llama-Guard-3-1B                 | meta-llama/Llama-Guard-3-1B              | 128K           |
-| Llama-Guard-3-8B                 | meta-llama/Llama-Guard-3-8B              | 128K           |
-| Llama-Guard-3-8B:int8-mp1        | meta-llama/Llama-Guard-3-8B-INT8         | 128K           |
-| Prompt-Guard-86M                 | meta-llama/Prompt-Guard-86M              | 128K           |
-| Llama-Guard-2-8B                 | meta-llama/Llama-Guard-2-8B              | 4K             |
-```
-
-To download models, you can use the `llama download` command.
-
-### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
-
-Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/)
-
-Download the required checkpoints using the following commands:
+First, install the Hugging Face CLI:

 ```bash
-# download the 8B model, this can be run on a single GPU
-llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url META_URL
-
-# you can also get the 70B model, this will require 8 GPUs however
-llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url META_URL
-
-# llama-agents have safety enabled by default. For this, you will need
-# safety models -- Llama-Guard and Prompt-Guard
-llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL
-llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL
+pip install huggingface_hub[cli]
 ```

-### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
+Then authenticate and download models:

-Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.

 ```bash
-llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token <HF_TOKEN>
-
-llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token <HF_TOKEN>
-
-llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original*
-llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original*
-```
-
-**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
-
-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+# Authenticate with Hugging Face
+huggingface-cli login
+
+# Download a model
+huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct
 ```

 ## List the downloaded models

-To list the downloaded models with the following command:
-```
-llama model list --downloaded
-```
-
-You should see a table like this:
-
-```
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
-```
+To list the downloaded models, you can use the Hugging Face CLI:
+```bash
+# List all downloaded models in your local cache
+huggingface-cli scan-cache
+```

-## Understand the models
-
-The `llama model` command helps you explore the model’s interface.
-
-1. `download`: Download the model from different sources. (meta, huggingface)
-2. `list`: Lists all the models available for download with hardware requirements for deploying the models.
-3. `prompt-format`: Show llama model message formats.
-4. `describe`: Describes all the properties of the model.
-
-### Sample Usage
-
-`llama model <subcommand> <options>`
-
-```
-llama model --help
-```
-```
-usage: llama model [-h] {download,list,prompt-format,describe,verify-download,remove} ...
-
-Work with llama models
-
-options:
-  -h, --help  show this help message and exit
-
-model_subcommands:
-  {download,list,prompt-format,describe,verify-download,remove}
-```
-
-### Describe
-
-You can use the describe command to know more about a model:
-```
-llama model describe -m Llama3.2-3B-Instruct
-```
-```
-| Model                       | Llama3.2-3B-Instruct             |
-| Hugging Face ID             | meta-llama/Llama-3.2-3B-Instruct |
-| Description                 | Llama 3.2 3b instruct model      |
-| Context Length              | 128K tokens                      |
-| Weights format              | bf16                             |
-| Model params.json           | { "dim": 3072, "n_layers": 28, "n_heads": 24, "n_kv_heads": 8, "vocab_size": 128256, "ffn_dim_multiplier": 1.0, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 500000.0, "use_scaled_rope": true } |
-| Recommended sampling params | { "temperature": 1.0, "top_p": 0.9, "top_k": 0 } |
-```
-
-### Prompt Format
-You can even run `llama model prompt-format` see all of the templates and their tokens:
-
-```
-llama model prompt-format -m Llama3.2-3B-Instruct
-```
-
-
-You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.
-
-**NOTE**: Outputs in terminal are color printed to show special tokens.
-
-### Remove model
-You can run `llama model remove` to remove an unnecessary model:
-
-```
-llama model remove -m Llama-Guard-3-8B-int8
-```

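Taken together with the quick-start change earlier in this commit, the end-to-end flow after this PR looks roughly like the following sketch; the model choice and template name are taken from the updated docs and are illustrative:

```bash
MODEL="Llama-3.2-3B-Instruct"

# Fetch the checkpoint with the Hugging Face CLI
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL

# Then build and run a Llama Stack server against it
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
```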
@ -51,11 +51,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-    "!pip install uv\n",
+    "!pip install uv \"huggingface_hub[cli]\"\n",
     "\n",
     "MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n",
     "# get meta url from llama.com\n",
-    "!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
+    "huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL\n",
     "\n",
     "model_id = f\"meta-llama/{MODEL}\""
 ]

|
@ -1,495 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from datetime import UTC, datetime
|
|
||||||
from functools import partial
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
from pydantic import BaseModel, ConfigDict
|
|
||||||
from rich.console import Console
|
|
||||||
from rich.progress import (
|
|
||||||
BarColumn,
|
|
||||||
DownloadColumn,
|
|
||||||
Progress,
|
|
||||||
TextColumn,
|
|
||||||
TimeRemainingColumn,
|
|
||||||
TransferSpeedColumn,
|
|
||||||
)
|
|
||||||
from termcolor import cprint
|
|
||||||
|
|
||||||
from llama_stack.cli.subcommand import Subcommand
|
|
||||||
from llama_stack.models.llama.sku_list import LlamaDownloadInfo
|
|
||||||
from llama_stack.models.llama.sku_types import Model
|
|
||||||
|
|
||||||
|
|
||||||
class Download(Subcommand):
|
|
||||||
"""Llama cli for downloading llama toolchain assets"""
|
|
||||||
|
|
||||||
def __init__(self, subparsers: argparse._SubParsersAction):
|
|
||||||
super().__init__()
|
|
||||||
self.parser = subparsers.add_parser(
|
|
||||||
"download",
|
|
||||||
prog="llama download",
|
|
||||||
description="Download a model from llama.meta.com or Hugging Face Hub",
|
|
||||||
formatter_class=argparse.RawTextHelpFormatter,
|
|
||||||
)
|
|
||||||
setup_download_parser(self.parser)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_download_parser(parser: argparse.ArgumentParser) -> None:
|
|
||||||
parser.add_argument(
|
|
||||||
"--source",
|
|
||||||
choices=["meta", "huggingface"],
|
|
||||||
default="meta",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--model-id",
|
|
||||||
required=False,
|
|
||||||
help="See `llama model list` or `llama model list --show-all` for the list of available models. Specify multiple model IDs with commas, e.g. --model-id Llama3.2-1B,Llama3.2-3B",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--hf-token",
|
|
||||||
type=str,
|
|
||||||
required=False,
|
|
||||||
default=None,
|
|
||||||
help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--meta-url",
|
|
||||||
type=str,
|
|
||||||
required=False,
|
|
||||||
help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-parallel",
|
|
||||||
type=int,
|
|
||||||
required=False,
|
|
||||||
default=3,
|
|
||||||
help="Maximum number of concurrent downloads",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--ignore-patterns",
|
|
||||||
type=str,
|
|
||||||
required=False,
|
|
||||||
default="*.safetensors",
|
|
||||||
help="""For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
|
|
||||||
safetensors files to avoid downloading duplicate weights.
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--manifest-file",
|
|
||||||
type=str,
|
|
||||||
help="For source=meta, you can download models from a manifest file containing a file => URL mapping",
|
|
||||||
required=False,
|
|
||||||
)
|
|
||||||
parser.set_defaults(func=partial(run_download_cmd, parser=parser))
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class DownloadTask:
|
|
||||||
url: str
|
|
||||||
output_file: str
|
|
||||||
total_size: int = 0
|
|
||||||
downloaded_size: int = 0
|
|
||||||
task_id: int | None = None
|
|
||||||
retries: int = 0
|
|
||||||
max_retries: int = 3
|
|
||||||
|
|
||||||
|
|
||||||
class DownloadError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class CustomTransferSpeedColumn(TransferSpeedColumn):
|
|
||||||
def render(self, task):
|
|
||||||
if task.finished:
|
|
||||||
return "-"
|
|
||||||
return super().render(task)
|
|
||||||
|
|
||||||
|
|
||||||
class ParallelDownloader:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_concurrent_downloads: int = 3,
|
|
||||||
buffer_size: int = 1024 * 1024,
|
|
||||||
timeout: int = 30,
|
|
||||||
):
|
|
||||||
self.max_concurrent_downloads = max_concurrent_downloads
|
|
||||||
self.buffer_size = buffer_size
|
|
||||||
self.timeout = timeout
|
|
||||||
self.console = Console()
|
|
||||||
self.progress = Progress(
|
|
||||||
TextColumn("[bold blue]{task.description}"),
|
|
||||||
BarColumn(bar_width=40),
|
|
||||||
"[progress.percentage]{task.percentage:>3.1f}%",
|
|
||||||
DownloadColumn(),
|
|
||||||
CustomTransferSpeedColumn(),
|
|
||||||
TimeRemainingColumn(),
|
|
||||||
console=self.console,
|
|
||||||
expand=True,
|
|
||||||
)
|
|
||||||
self.client_options = {
|
|
||||||
"timeout": httpx.Timeout(timeout),
|
|
||||||
"follow_redirects": True,
|
|
||||||
}
|
|
||||||
|
|
||||||
async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs):
|
|
||||||
last_exception = None
|
|
||||||
for attempt in range(task.max_retries):
|
|
||||||
try:
|
|
||||||
return await func(*args, **kwargs)
|
|
||||||
except Exception as e:
|
|
||||||
last_exception = e
|
|
||||||
if attempt < task.max_retries - 1:
|
|
||||||
wait_time = min(30, 2**attempt) # Cap at 30 seconds
|
|
||||||
self.console.print(
|
|
||||||
f"[yellow]Attempt {attempt + 1}/{task.max_retries} failed, "
|
|
||||||
f"retrying in {wait_time} seconds: {str(e)}[/yellow]"
|
|
||||||
)
|
|
||||||
await asyncio.sleep(wait_time)
|
|
||||||
continue
|
|
||||||
raise last_exception
|
|
||||||
|
|
||||||
async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None:
|
|
||||||
if task.total_size > 0:
|
|
||||||
self.progress.update(task.task_id, total=task.total_size)
|
|
||||||
return
|
|
||||||
|
|
||||||
async def _get_info():
|
|
||||||
response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options)
|
|
||||||
response.raise_for_status()
|
|
||||||
return response
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = await self.retry_with_exponential_backoff(task, _get_info)
|
|
||||||
|
|
||||||
task.url = str(response.url)
|
|
||||||
task.total_size = int(response.headers.get("Content-Length", 0))
|
|
||||||
|
|
||||||
if task.total_size == 0:
|
|
||||||
raise DownloadError(
|
|
||||||
f"Unable to determine file size for {task.output_file}. "
|
|
||||||
"The server might not support range requests."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Update the progress bar's total size once we know it
|
|
||||||
if task.task_id is not None:
|
|
||||||
self.progress.update(task.task_id, total=task.total_size)
|
|
||||||
|
|
||||||
except httpx.HTTPError as e:
|
|
||||||
self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def verify_file_integrity(self, task: DownloadTask) -> bool:
|
|
||||||
if not os.path.exists(task.output_file):
|
|
||||||
return False
|
|
||||||
return os.path.getsize(task.output_file) == task.total_size
|
|
||||||
|
|
||||||
async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None:
|
|
||||||
async def _download_chunk():
|
|
||||||
headers = {"Range": f"bytes={start}-{end}"}
|
|
||||||
async with client.stream("GET", task.url, headers=headers, **self.client_options) as response:
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
with open(task.output_file, "ab") as file:
|
|
||||||
file.seek(start)
|
|
||||||
async for chunk in response.aiter_bytes(self.buffer_size):
|
|
||||||
file.write(chunk)
|
|
||||||
task.downloaded_size += len(chunk)
|
|
||||||
self.progress.update(
|
|
||||||
task.task_id,
|
|
||||||
completed=task.downloaded_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
await self.retry_with_exponential_backoff(task, _download_chunk)
|
|
||||||
except Exception as e:
|
|
||||||
raise DownloadError(
|
|
||||||
f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}"
|
|
||||||
) from e
|
|
||||||
|
|
||||||
async def prepare_download(self, task: DownloadTask) -> None:
|
|
||||||
output_dir = os.path.dirname(task.output_file)
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
|
||||||
|
|
||||||
if os.path.exists(task.output_file):
|
|
||||||
task.downloaded_size = os.path.getsize(task.output_file)
|
|
||||||
|
|
||||||
async def download_file(self, task: DownloadTask) -> None:
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(**self.client_options) as client:
|
|
||||||
await self.get_file_info(client, task)
|
|
||||||
|
|
||||||
# Check if file is already downloaded
|
|
||||||
if os.path.exists(task.output_file):
|
|
||||||
if self.verify_file_integrity(task):
|
|
||||||
self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
|
|
||||||
self.progress.update(task.task_id, completed=task.total_size)
|
|
||||||
return
|
|
||||||
|
|
||||||
await self.prepare_download(task)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Split the remaining download into chunks
|
|
||||||
chunk_size = 27_000_000_000 # Cloudfront max chunk size
|
|
||||||
chunks = []
|
|
||||||
|
|
||||||
current_pos = task.downloaded_size
|
|
||||||
while current_pos < task.total_size:
|
|
||||||
chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1)
|
|
||||||
chunks.append((current_pos, chunk_end))
|
|
||||||
current_pos = chunk_end + 1
|
|
||||||
|
|
||||||
# Download chunks in sequence
|
|
||||||
for chunk_start, chunk_end in chunks:
|
|
||||||
await self.download_chunk(client, task, chunk_start, chunk_end)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
raise DownloadError(f"Download failed: {str(e)}") from e
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
|
|
||||||
raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
|
|
||||||
|
|
||||||
def has_disk_space(self, tasks: list[DownloadTask]) -> bool:
|
|
||||||
try:
|
|
||||||
total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
|
|
||||||
dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
|
|
||||||
free_space = shutil.disk_usage(dir_path).free
|
|
||||||
|
|
||||||
# Add 10% buffer for safety
|
|
||||||
required_space = int(total_remaining_size * 1.1)
|
|
||||||
|
|
||||||
if free_space < required_space:
|
|
||||||
self.console.print(
|
|
||||||
f"[red]Not enough disk space. Required: {required_space // (1024 * 1024)} MB, "
|
|
||||||
f"Available: {free_space // (1024 * 1024)} MB[/red]"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
raise DownloadError(f"Failed to check disk space: {str(e)}") from e
|
|
||||||
|
|
||||||
async def download_all(self, tasks: list[DownloadTask]) -> None:
|
|
||||||
if not tasks:
|
|
||||||
raise ValueError("No download tasks provided")
|
|
||||||
|
|
||||||
if not os.environ.get("LLAMA_DOWNLOAD_NO_SPACE_CHECK") and not self.has_disk_space(tasks):
|
|
||||||
raise DownloadError("Insufficient disk space for downloads")
|
|
||||||
|
|
||||||
failed_tasks = []
|
|
||||||
|
|
||||||
with self.progress:
|
|
||||||
for task in tasks:
|
|
||||||
desc = f"Downloading {Path(task.output_file).name}"
|
|
||||||
task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
|
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
|
|
||||||
|
|
||||||
async def download_with_semaphore(task: DownloadTask):
|
|
||||||
async with semaphore:
|
|
||||||
try:
|
|
||||||
await self.download_file(task)
|
|
||||||
except Exception as e:
|
|
||||||
failed_tasks.append((task, str(e)))
|
|
||||||
|
|
||||||
await asyncio.gather(*(download_with_semaphore(task) for task in tasks))
|
|
||||||
|
|
||||||
if failed_tasks:
|
|
||||||
self.console.print("\n[red]Some downloads failed:[/red]")
|
|
||||||
for task, error in failed_tasks:
|
|
||||||
self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]")
|
|
||||||
raise DownloadError(f"{len(failed_tasks)} downloads failed")
|
|
||||||
|
|
||||||
|
|
||||||
def _hf_download(
|
|
||||||
model: "Model",
|
|
||||||
hf_token: str,
|
|
||||||
ignore_patterns: str,
|
|
||||||
parser: argparse.ArgumentParser,
|
|
||||||
):
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
|
|
||||||
|
|
||||||
from llama_stack.core.utils.model_utils import model_local_dir
|
|
||||||
|
|
||||||
repo_id = model.huggingface_repo
|
|
||||||
if repo_id is None:
|
|
||||||
raise ValueError(f"No repo id found for model {model.descriptor()}")
|
|
||||||
|
|
||||||
output_dir = model_local_dir(model.descriptor())
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
|
||||||
try:
|
|
||||||
true_output_dir = snapshot_download(
|
|
||||||
repo_id,
|
|
||||||
local_dir=output_dir,
|
|
||||||
ignore_patterns=ignore_patterns,
|
|
||||||
token=hf_token,
|
|
||||||
library_name="llama-stack",
|
|
||||||
)
|
|
||||||
except GatedRepoError:
|
|
||||||
parser.error(
|
|
||||||
"It looks like you are trying to access a gated repository. Please ensure you "
|
|
||||||
"have access to the repository and have provided the proper Hugging Face API token "
|
|
||||||
"using the option `--hf-token` or by running `huggingface-cli login`."
|
|
||||||
"You can find your token by visiting https://huggingface.co/settings/tokens"
|
|
||||||
)
|
|
||||||
except RepositoryNotFoundError:
|
|
||||||
parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub or incorrect Hugging Face token.")
|
|
||||||
except Exception as e:
|
|
||||||
parser.error(e)
|
|
||||||
|
|
||||||
print(f"\nSuccessfully downloaded model to {true_output_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def _meta_download(
|
|
||||||
model: "Model",
|
|
||||||
model_id: str,
|
|
||||||
meta_url: str,
|
|
||||||
info: "LlamaDownloadInfo",
|
|
||||||
max_concurrent_downloads: int,
|
|
||||||
):
|
|
||||||
from llama_stack.core.utils.model_utils import model_local_dir
|
|
||||||
|
|
||||||
output_dir = Path(model_local_dir(model.descriptor()))
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
|
||||||
|
|
||||||
# Create download tasks for each file
|
|
||||||
tasks = []
|
|
||||||
for f in info.files:
|
|
||||||
output_file = str(output_dir / f)
|
|
||||||
url = meta_url.replace("*", f"{info.folder}/{f}")
|
|
||||||
total_size = info.pth_size if "consolidated" in f else 0
|
|
||||||
tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3))
|
|
||||||
|
|
||||||
# Initialize and run parallel downloader
|
|
||||||
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
|
|
||||||
asyncio.run(downloader.download_all(tasks))
|
|
||||||
|
|
||||||
cprint(f"\nSuccessfully downloaded model to {output_dir}", color="green", file=sys.stderr)
|
|
||||||
cprint(
|
|
||||||
f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
cprint(
|
|
||||||
f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
|
|
||||||
color="yellow",
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ModelEntry(BaseModel):
|
|
||||||
model_id: str
|
|
||||||
files: dict[str, str]
|
|
||||||
|
|
||||||
model_config = ConfigDict(protected_namespaces=())
|
|
||||||
|
|
||||||
|
|
||||||
class Manifest(BaseModel):
|
|
||||||
models: list[ModelEntry]
|
|
||||||
expires_on: datetime
|
|
||||||
|
|
||||||
|
|
||||||
def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
|
|
||||||
from llama_stack.core.utils.model_utils import model_local_dir
|
|
||||||
|
|
||||||
with open(manifest_file) as f:
|
|
||||||
d = json.load(f)
|
|
||||||
manifest = Manifest(**d)
|
|
||||||
|
|
||||||
if datetime.now(UTC) > manifest.expires_on.astimezone(UTC):
|
|
||||||
raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}")
|
|
||||||
|
|
||||||
console = Console()
|
|
||||||
for entry in manifest.models:
|
|
||||||
console.print(f"[blue]Downloading model {entry.model_id}...[/blue]")
|
|
||||||
output_dir = Path(model_local_dir(entry.model_id))
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
|
||||||
|
|
||||||
if any(output_dir.iterdir()):
|
|
||||||
console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]")
|
|
||||||
|
|
||||||
while True:
|
|
||||||
resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ")
|
|
||||||
if resp.lower() in ["restart", "r"]:
|
|
||||||
shutil.rmtree(output_dir)
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
|
||||||
break
|
|
||||||
elif resp.lower() in ["continue", "c"]:
|
|
||||||
console.print("[blue]Continuing download...[/blue]")
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
console.print("[red]Invalid response. Please try again.[/red]")
|
|
||||||
|
|
||||||
# Create download tasks for all files in the manifest
|
|
||||||
tasks = [
|
|
||||||
DownloadTask(url=url, output_file=str(output_dir / fname), max_retries=3)
|
|
||||||
for fname, url in entry.files.items()
|
|
||||||
]
|
|
||||||
|
|
||||||
# Initialize and run parallel downloader
|
|
||||||
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
|
|
||||||
asyncio.run(downloader.download_all(tasks))
|
|
||||||
|
|
||||||
|
|
||||||
def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
|
|
||||||
"""Main download command handler"""
|
|
||||||
try:
|
|
||||||
if args.manifest_file:
|
|
||||||
_download_from_manifest(args.manifest_file, args.max_parallel)
|
|
||||||
return
|
|
||||||
|
|
||||||
if args.model_id is None:
|
|
||||||
parser.error("Please provide a model id")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Handle comma-separated model IDs
|
|
||||||
model_ids = [model_id.strip() for model_id in args.model_id.split(",")]
|
|
||||||
|
|
||||||
from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model
|
|
||||||
|
|
||||||
from .model.safety_models import (
|
|
||||||
prompt_guard_download_info_map,
|
|
||||||
prompt_guard_model_sku_map,
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt_guard_model_sku_map = prompt_guard_model_sku_map()
|
|
||||||
prompt_guard_download_info_map = prompt_guard_download_info_map()
|
|
||||||
|
|
||||||
for model_id in model_ids:
|
|
||||||
if model_id in prompt_guard_model_sku_map.keys():
|
|
||||||
model = prompt_guard_model_sku_map[model_id]
|
|
||||||
info = prompt_guard_download_info_map[model_id]
|
|
||||||
else:
|
|
||||||
model = resolve_model(model_id)
|
|
||||||
if model is None:
|
|
||||||
parser.error(f"Model {model_id} not found")
|
|
||||||
continue
|
|
||||||
info = llama_meta_net_info(model)
|
|
||||||
|
|
||||||
if args.source == "huggingface":
|
|
||||||
_hf_download(model, args.hf_token, args.ignore_patterns, parser)
|
|
||||||
else:
|
|
||||||
meta_url = args.meta_url or input(
|
|
||||||
f"Please provide the signed URL for model {model_id} you received via email "
|
|
||||||
f"after visiting https://www.llama.com/llama-downloads/ "
|
|
||||||
f"(e.g., https://llama3-1.llamameta.net/*?Policy...): "
|
|
||||||
)
|
|
||||||
if "llamameta.net" not in meta_url:
|
|
||||||
parser.error("Invalid Meta URL provided")
|
|
||||||
_meta_download(model, model_id, meta_url, info, args.max_parallel)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
parser.error(f"Download failed: {str(e)}")
|
|
@ -6,11 +6,8 @@

 import argparse

-from .download import Download
-from .model import ModelParser
 from .stack import StackParser
 from .stack.utils import print_subcommand_description
-from .verify_download import VerifyDownload


 class LlamaCLIParser:

@ -30,10 +27,7 @@ class LlamaCLIParser:
         subparsers = self.parser.add_subparsers(title="subcommands")

         # Add sub-commands
-        ModelParser.create(subparsers)
         StackParser.create(subparsers)
-        Download.create(subparsers)
-        VerifyDownload.create(subparsers)

         print_subcommand_description(self.parser, subparsers)

@ -1,7 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
from .model import ModelParser # noqa
|
|
|
@@ -1,70 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import json
-
-from llama_stack.cli.subcommand import Subcommand
-from llama_stack.cli.table import print_table
-from llama_stack.models.llama.sku_list import resolve_model
-
-
-class ModelDescribe(Subcommand):
-    """Show details about a model"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "describe",
-            prog="llama model describe",
-            description="Show details about a llama model",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_model_describe_cmd)
-
-    def _add_arguments(self):
-        self.parser.add_argument(
-            "-m",
-            "--model-id",
-            type=str,
-            required=True,
-            help="See `llama model list` or `llama model list --show-all` for the list of available models",
-        )
-
-    def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku_map
-
-        prompt_guard_model_map = prompt_guard_model_sku_map()
-        if args.model_id in prompt_guard_model_map.keys():
-            model = prompt_guard_model_map[args.model_id]
-        else:
-            model = resolve_model(args.model_id)
-
-        if model is None:
-            self.parser.error(
-                f"Model {args.model_id} not found; try 'llama model list' for a list of available models."
-            )
-            return
-
-        headers = [
-            "Model",
-            model.descriptor(),
-        ]
-
-        rows = [
-            ("Hugging Face ID", model.huggingface_repo or "<Not Available>"),
-            ("Description", model.description),
-            ("Context Length", f"{model.max_seq_length // 1024}K tokens"),
-            ("Weights format", model.quantization_format.value),
-            ("Model params.json", json.dumps(model.arch_args, indent=4)),
-        ]
-
-        print_table(
-            rows,
-            headers,
-            separate_rows=True,
-        )
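With `llama model describe` removed, basic model metadata has to come from the Hugging Face Hub instead. A minimal sketch using `huggingface_hub.model_info`; the repo id below is only an illustration, not something this commit prescribes:

```python
# Sketch: fetch basic model metadata from the Hugging Face Hub
# instead of the removed `llama model describe` command.
from huggingface_hub import model_info

info = model_info("meta-llama/Llama-3.2-1B-Instruct")  # illustrative, possibly gated repo
print("Repo:", info.id)
print("Tags:", info.tags)
print("Files:", [s.rfilename for s in (info.siblings or [])])
```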
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-
-from llama_stack.cli.subcommand import Subcommand
-
-
-class ModelDownload(Subcommand):
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "download",
-            prog="llama model download",
-            description="Download a model from llama.meta.com or Hugging Face Hub",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-
-        from llama_stack.cli.download import setup_download_parser
-
-        setup_download_parser(self.parser)
@@ -1,119 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import os
-import time
-from pathlib import Path
-
-from llama_stack.cli.subcommand import Subcommand
-from llama_stack.cli.table import print_table
-from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
-from llama_stack.models.llama.sku_list import all_registered_models
-
-
-def _get_model_size(model_dir):
-    return sum(f.stat().st_size for f in Path(model_dir).rglob("*") if f.is_file())
-
-
-def _convert_to_model_descriptor(model):
-    for m in all_registered_models():
-        if model == m.descriptor().replace(":", "-"):
-            return str(m.descriptor())
-    return str(model)
-
-
-def _run_model_list_downloaded_cmd() -> None:
-    headers = ["Model", "Size", "Modified Time"]
-
-    rows = []
-    for model in os.listdir(DEFAULT_CHECKPOINT_DIR):
-        abs_path = os.path.join(DEFAULT_CHECKPOINT_DIR, model)
-        space_usage = _get_model_size(abs_path)
-        model_size = f"{space_usage / (1024**3):.2f} GB"
-        modified_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getmtime(abs_path)))
-        rows.append(
-            [
-                _convert_to_model_descriptor(model),
-                model_size,
-                modified_time,
-            ]
-        )
-
-    print_table(
-        rows,
-        headers,
-        separate_rows=True,
-    )
-
-
-class ModelList(Subcommand):
-    """List available llama models"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "list",
-            prog="llama model list",
-            description="Show available llama models",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_model_list_cmd)
-
-    def _add_arguments(self):
-        self.parser.add_argument(
-            "--show-all",
-            action="store_true",
-            help="Show all models (not just defaults)",
-        )
-        self.parser.add_argument(
-            "--downloaded",
-            action="store_true",
-            help="List the downloaded models",
-        )
-        self.parser.add_argument(
-            "-s",
-            "--search",
-            type=str,
-            required=False,
-            help="Search for the input string as a substring in the model descriptor(ID)",
-        )
-
-    def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_skus
-
-        if args.downloaded:
-            return _run_model_list_downloaded_cmd()
-
-        headers = [
-            "Model Descriptor(ID)",
-            "Hugging Face Repo",
-            "Context Length",
-        ]
-
-        rows = []
-        for model in all_registered_models() + prompt_guard_model_skus():
-            if not args.show_all and not model.is_featured:
-                continue
-
-            descriptor = model.descriptor()
-            if not args.search or args.search.lower() in descriptor.lower():
-                rows.append(
-                    [
-                        descriptor,
-                        model.huggingface_repo,
-                        f"{model.max_seq_length // 1024}K",
-                    ]
-                )
-        if len(rows) == 0:
-            print(f"Did not find any model matching `{args.search}`.")
-        else:
-            print_table(
-                rows,
-                headers,
-                separate_rows=True,
-            )
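With `llama model list --downloaded` gone, locally cached models can still be inspected through the Hugging Face tooling. A rough Python sketch using `huggingface_hub.scan_cache_dir()`; note this is an assumption that models were downloaded into the Hub cache rather than placed directly under `~/.llama`:

```python
# Sketch: list locally cached Hub models, roughly replacing `llama model list --downloaded`.
from huggingface_hub import scan_cache_dir

cache = scan_cache_dir()
for repo in sorted(cache.repos, key=lambda r: r.repo_id):
    print(f"{repo.repo_id:60s} {repo.size_on_disk / 1024**3:8.2f} GB")
```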
@@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-
-from llama_stack.cli.model.describe import ModelDescribe
-from llama_stack.cli.model.download import ModelDownload
-from llama_stack.cli.model.list import ModelList
-from llama_stack.cli.model.prompt_format import ModelPromptFormat
-from llama_stack.cli.model.remove import ModelRemove
-from llama_stack.cli.model.verify_download import ModelVerifyDownload
-from llama_stack.cli.stack.utils import print_subcommand_description
-from llama_stack.cli.subcommand import Subcommand
-
-
-class ModelParser(Subcommand):
-    """Llama cli for model interface apis"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "model",
-            prog="llama model",
-            description="Work with llama models",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-
-        self.parser.set_defaults(func=lambda args: self.parser.print_help())
-
-        subparsers = self.parser.add_subparsers(title="model_subcommands")
-
-        # Add sub-commands
-        ModelDownload.create(subparsers)
-        ModelList.create(subparsers)
-        ModelPromptFormat.create(subparsers)
-        ModelDescribe.create(subparsers)
-        ModelVerifyDownload.create(subparsers)
-        ModelRemove.create(subparsers)
-
-        print_subcommand_description(self.parser, subparsers)
@@ -1,133 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import textwrap
-from io import StringIO
-from pathlib import Path
-
-from llama_stack.cli.subcommand import Subcommand
-from llama_stack.cli.table import print_table
-from llama_stack.models.llama.sku_types import CoreModelId, ModelFamily, is_multimodal, model_family
-
-ROOT_DIR = Path(__file__).parent.parent.parent
-
-
-class ModelPromptFormat(Subcommand):
-    """Llama model cli for describe a model prompt format (message formats)"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "prompt-format",
-            prog="llama model prompt-format",
-            description="Show llama model message formats",
-            epilog=textwrap.dedent(
-                """
-                Example:
-                    llama model prompt-format <options>
-                """
-            ),
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_model_template_cmd)
-
-    def _add_arguments(self):
-        self.parser.add_argument(
-            "-m",
-            "--model-name",
-            type=str,
-            help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n"
-            "(Run `llama model list` to see a list of valid model names)",
-        )
-        self.parser.add_argument(
-            "-l",
-            "--list",
-            action="store_true",
-            help="List all available models",
-        )
-
-    def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
-        import importlib.resources
-
-        # Only Llama 3.1 and 3.2 are supported
-        supported_model_ids = [
-            m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
-        ]
-
-        model_list = [m.value for m in supported_model_ids]
-
-        if args.list:
-            headers = ["Model(s)"]
-            rows = []
-            for m in model_list:
-                rows.append(
-                    [
-                        m,
-                    ]
-                )
-            print_table(
-                rows,
-                headers,
-                separate_rows=True,
-            )
-            return
-
-        try:
-            model_id = CoreModelId(args.model_name)
-        except ValueError:
-            self.parser.error(
-                f"{args.model_name} is not a valid Model. Choose one from the list of valid models. "
-                f"Run `llama model list` to see the valid model names."
-            )
-
-        if model_id not in supported_model_ids:
-            self.parser.error(
-                f"{model_id} is not a valid Model. Choose one from the list of valid models. "
-                f"Run `llama model list` to see the valid model names."
-            )
-
-        llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md"
-        llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md"
-        llama_3_2_vision_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "vision_prompt_format.md"
-        if model_family(model_id) == ModelFamily.llama3_1:
-            with importlib.resources.as_file(llama_3_1_file) as f:
-                content = f.open("r").read()
-        elif model_family(model_id) == ModelFamily.llama3_2:
-            if is_multimodal(model_id):
-                with importlib.resources.as_file(llama_3_2_vision_file) as f:
-                    content = f.open("r").read()
-            else:
-                with importlib.resources.as_file(llama_3_2_text_file) as f:
-                    content = f.open("r").read()
-
-        render_markdown_to_pager(content)
-
-
-def render_markdown_to_pager(markdown_content: str):
-    from rich.console import Console
-    from rich.markdown import Markdown
-    from rich.style import Style
-    from rich.text import Text
-
-    class LeftAlignedHeaderMarkdown(Markdown):
-        def parse_header(self, token):
-            level = token.type.count("h")
-            content = Text(token.content)
-            header_style = Style(color="bright_blue", bold=True)
-            header = Text(f"{'#' * level} ", style=header_style) + content
-            self.add_text(header)
-
-    # Render the Markdown
-    md = LeftAlignedHeaderMarkdown(markdown_content)
-
-    # Capture the rendered output
-    output = StringIO()
-    console = Console(file=output, force_terminal=True, width=100)  # Set a fixed width
-    console.print(md)
-    rendered_content = output.getvalue()
-    print(rendered_content)
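The pager logic removed above is essentially "render a Markdown file with rich". A stripped-down sketch of the same idea, with an illustrative file path rather than the repository's bundled prompt-format files:

```python
# Sketch: print a prompt-format Markdown document to the terminal with rich,
# as the removed `llama model prompt-format` command did internally.
from pathlib import Path

from rich.console import Console
from rich.markdown import Markdown

content = Path("prompt_format.md").read_text()  # illustrative path
Console(width=100).print(Markdown(content))
```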
@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import os
-import shutil
-
-from llama_stack.cli.subcommand import Subcommand
-from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
-from llama_stack.models.llama.sku_list import resolve_model
-
-
-class ModelRemove(Subcommand):
-    """Remove the downloaded llama model"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "remove",
-            prog="llama model remove",
-            description="Remove the downloaded llama model",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_model_remove_cmd)
-
-    def _add_arguments(self):
-        self.parser.add_argument(
-            "-m",
-            "--model",
-            required=True,
-            help="Specify the llama downloaded model name, see `llama model list --downloaded`",
-        )
-        self.parser.add_argument(
-            "-f",
-            "--force",
-            action="store_true",
-            help="Used to forcefully remove the llama model from the storage without further confirmation",
-        )
-
-    def _run_model_remove_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku_map
-
-        prompt_guard_model_map = prompt_guard_model_sku_map()
-
-        if args.model in prompt_guard_model_map.keys():
-            model = prompt_guard_model_map[args.model]
-        else:
-            model = resolve_model(args.model)
-
-        model_path = os.path.join(DEFAULT_CHECKPOINT_DIR, args.model.replace(":", "-"))
-
-        if model is None or not os.path.isdir(model_path):
-            print(f"'{args.model}' is not a valid llama model or does not exist.")
-            return
-
-        if args.force:
-            shutil.rmtree(model_path)
-            print(f"{args.model} removed.")
-        else:
-            if input(f"Are you sure you want to remove {args.model}? (y/n): ").strip().lower() == "y":
-                shutil.rmtree(model_path)
-                print(f"{args.model} removed.")
-            else:
-                print("Removal aborted.")
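Since `llama model remove` only deleted a checkpoint directory after a confirmation prompt, the same effect is a few lines of standard-library code. A sketch; the directory path is an assumption, not part of this diff:

```python
# Sketch: remove a downloaded checkpoint directory, mirroring the removed `llama model remove`.
import shutil
from pathlib import Path

model_dir = Path.home() / ".llama" / "checkpoints" / "Llama3.2-1B"  # illustrative path
if not model_dir.is_dir():
    print(f"{model_dir} does not exist.")
elif input(f"Are you sure you want to remove {model_dir}? (y/n): ").strip().lower() == "y":
    shutil.rmtree(model_dir)
    print(f"{model_dir} removed.")
else:
    print("Removal aborted.")
```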
@@ -1,64 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from pydantic import BaseModel, ConfigDict, Field
-
-from llama_stack.models.llama.sku_list import LlamaDownloadInfo
-from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat
-
-
-class PromptGuardModel(BaseModel):
-    """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
-
-    model_id: str
-    huggingface_repo: str
-    description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
-    is_featured: bool = False
-    max_seq_length: int = 512
-    is_instruct_model: bool = False
-    quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
-    arch_args: dict[str, Any] = Field(default_factory=dict)
-
-    def descriptor(self) -> str:
-        return self.model_id
-
-    model_config = ConfigDict(protected_namespaces=())
-
-
-def prompt_guard_model_skus():
-    return [
-        PromptGuardModel(model_id="Prompt-Guard-86M", huggingface_repo="meta-llama/Prompt-Guard-86M"),
-        PromptGuardModel(
-            model_id="Llama-Prompt-Guard-2-86M",
-            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-86M",
-        ),
-        PromptGuardModel(
-            model_id="Llama-Prompt-Guard-2-22M",
-            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-22M",
-        ),
-    ]
-
-
-def prompt_guard_model_sku_map() -> dict[str, Any]:
-    return {model.model_id: model for model in prompt_guard_model_skus()}
-
-
-def prompt_guard_download_info_map() -> dict[str, LlamaDownloadInfo]:
-    return {
-        model.model_id: LlamaDownloadInfo(
-            folder="Prompt-Guard" if model.model_id == "Prompt-Guard-86M" else model.model_id,
-            files=[
-                "model.safetensors",
-                "special_tokens_map.json",
-                "tokenizer.json",
-                "tokenizer_config.json",
-            ],
-            pth_size=1,
-        )
-        for model in prompt_guard_model_skus()
-    }
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-
-from llama_stack.cli.subcommand import Subcommand
-
-
-class ModelVerifyDownload(Subcommand):
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "verify-download",
-            prog="llama model verify-download",
-            description="Verify the downloaded checkpoints' checksums for models downloaded from Meta",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-
-        from llama_stack.cli.verify_download import setup_verify_download_parser
-
-        setup_verify_download_parser(self.parser)
@@ -1,141 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import hashlib
-from dataclasses import dataclass
-from functools import partial
-from pathlib import Path
-
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-
-from llama_stack.cli.subcommand import Subcommand
-
-
-@dataclass
-class VerificationResult:
-    filename: str
-    expected_hash: str
-    actual_hash: str | None
-    exists: bool
-    matches: bool
-
-
-class VerifyDownload(Subcommand):
-    """Llama cli for verifying downloaded model files"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "verify-download",
-            prog="llama verify-download",
-            description="Verify integrity of downloaded model files",
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        setup_verify_download_parser(self.parser)
-
-
-def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
-    parser.add_argument(
-        "--model-id",
-        required=True,
-        help="Model ID to verify (only for models downloaded from Meta)",
-    )
-    parser.set_defaults(func=partial(run_verify_cmd, parser=parser))
-
-
-def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
-    sha256_hash = hashlib.sha256()
-    with open(filepath, "rb") as f:
-        for chunk in iter(lambda: f.read(chunk_size), b""):
-            sha256_hash.update(chunk)
-    return sha256_hash.hexdigest()
-
-
-def load_checksums(checklist_path: Path) -> dict[str, str]:
-    checksums = {}
-    with open(checklist_path) as f:
-        for line in f:
-            if line.strip():
-                sha256sum, filepath = line.strip().split(" ", 1)
-                # Remove leading './' if present
-                filepath = filepath.lstrip("./")
-                checksums[filepath] = sha256sum
-    return checksums
-
-
-def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -> list[VerificationResult]:
-    results = []
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        for filepath, expected_hash in checksums.items():
-            full_path = model_dir / filepath
-            task_id = progress.add_task(f"Verifying {filepath}...", total=None)
-
-            exists = full_path.exists()
-            actual_hash = None
-            matches = False
-
-            if exists:
-                actual_hash = calculate_sha256(full_path)
-                matches = actual_hash == expected_hash
-
-            results.append(
-                VerificationResult(
-                    filename=filepath,
-                    expected_hash=expected_hash,
-                    actual_hash=actual_hash,
-                    exists=exists,
-                    matches=matches,
-                )
-            )
-
-            progress.remove_task(task_id)
-
-    return results
-
-
-def run_verify_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
-    from llama_stack.core.utils.model_utils import model_local_dir
-
-    console = Console()
-    model_dir = Path(model_local_dir(args.model_id))
-    checklist_path = model_dir / "checklist.chk"
-
-    if not model_dir.exists():
-        parser.error(f"Model directory not found: {model_dir}")
-
-    if not checklist_path.exists():
-        parser.error(f"Checklist file not found: {checklist_path}")
-
-    checksums = load_checksums(checklist_path)
-    results = verify_files(model_dir, checksums, console)
-
-    # Print results
-    console.print("\nVerification Results:")
-
-    all_good = True
-    for result in results:
-        if not result.exists:
-            console.print(f"[red]❌ {result.filename}: File not found[/red]")
-            all_good = False
-        elif not result.matches:
-            console.print(
-                f"[red]❌ {result.filename}: Hash mismatch[/red]\n"
-                f"   Expected: {result.expected_hash}\n"
-                f"   Got: {result.actual_hash}"
-            )
-            all_good = False
-        else:
-            console.print(f"[green]✓ {result.filename}: Verified[/green]")
-
-    if all_good:
-        console.print("\n[green]All files verified successfully![/green]")
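Stripped of argparse and the rich progress UI, the verification removed above reduces to hashing each file listed in `checklist.chk`. A compact sketch; the model directory is an assumption:

```python
# Sketch: verify sha256 checksums against a Meta-style checklist.chk,
# the core of the removed `llama verify-download` command.
import hashlib
from pathlib import Path

model_dir = Path.home() / ".llama" / "checkpoints" / "Llama3.2-1B"  # illustrative path
for line in (model_dir / "checklist.chk").read_text().splitlines():
    if not line.strip():
        continue
    expected, name = line.strip().split(maxsplit=1)
    path = model_dir / name.lstrip("./")
    actual = hashlib.sha256(path.read_bytes()).hexdigest() if path.exists() else None
    status = "ok" if actual == expected else ("missing" if actual is None else "MISMATCH")
    print(f"{status:8s} {name}")
```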
@@ -29,31 +29,7 @@ The following environment variables can be configured:

 ## Prerequisite: Downloading Models

-Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-
-```
-$ llama model list --downloaded
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
-┃ Model                                   ┃ Size     ┃ Modified Time       ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
-│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
-│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
-│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
-│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
-│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
-│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
-│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
-│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
-│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
-└─────────────────────────────────────────┴──────────┴─────────────────────┘
-```
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.

 ## Running the Distribution
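The updated docs lean on the Hugging Face CLI; for readers who prefer the Python API, the equivalent call is `snapshot_download`. A sketch only: the model name and target directory are illustrative assumptions, and gated Meta repos require an accepted license plus a configured token.

```python
# Sketch: download checkpoints into ~/.llama with huggingface_hub instead of the removed CLI commands.
from pathlib import Path

from huggingface_hub import snapshot_download

model = "Llama-3.2-1B-Instruct"  # illustrative model name
local_dir = Path.home() / ".llama" / model
snapshot_download(repo_id=f"meta-llama/{model}", local_dir=local_dir)
print(f"Checkpoint files are in {local_dir}")
```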
@@ -25,14 +25,13 @@ classifiers = [
]
dependencies = [
    "aiohttp",
    "fastapi>=0.115.0,<1.0",  # server
    "fire",  # for MCP in LLS client
    "httpx",
-    "huggingface-hub>=0.34.0,<1.0",
    "jinja2>=3.1.6",
    "jsonschema",
    "llama-stack-client>=0.2.23",
    "openai>=1.107",  # for expires_after support
    "prompt-toolkit",
    "python-dotenv",
    "python-jose[cryptography]",
@@ -43,13 +42,13 @@ dependencies = [
    "tiktoken",
    "pillow",
    "h11>=0.16.0",
    "python-multipart>=0.0.20",  # For fastapi Form
    "uvicorn>=0.34.0",  # server
    "opentelemetry-sdk>=1.30.0",  # server
    "opentelemetry-exporter-otlp-proto-http>=1.30.0",  # server
    "aiosqlite>=0.21.0",  # server - for metadata store
    "asyncpg",  # for metadata store
    "sqlalchemy[asyncio]>=2.0.41",  # server - for conversations
]

[project.optional-dependencies]
@@ -68,14 +67,14 @@ dev = [
    "pytest-cov",
    "pytest-html",
    "pytest-json-report",
    "pytest-socket",  # For blocking network access in unit tests
    "nbval",  # For notebook testing
    "black",
    "ruff",
    "types-requests",
    "types-setuptools",
    "pre-commit",
    "ruamel.yaml",  # needed for openapi generator
]
# These are the dependencies required for running unit tests.
unit = [
@@ -141,9 +140,7 @@ docs = [
    "requests",
]
codegen = ["rich", "pydantic>=2.11.9", "jinja2>=3.1.6"]
-benchmark = [
-    "locust>=2.39.1",
-]
+benchmark = ["locust>=2.39.1"]

[project.urls]
Homepage = "https://github.com/llamastack/llama-stack"
@@ -242,7 +239,6 @@ follow_imports = "silent"
# to exclude the entire directory.
exclude = [
    # As we fix more and more of these, we should remove them from the list
-    "^llama_stack/cli/download\\.py$",
    "^llama_stack.core/build\\.py$",
    "^llama_stack.core/client\\.py$",
    "^llama_stack.core/request_headers\\.py$",
@@ -332,6 +328,4 @@ classmethod-decorators = ["classmethod", "pydantic.field_validator"]
[tool.pytest.ini_options]
addopts = ["--durations=10"]
asyncio_mode = "auto"
-markers = [
-    "allow_network: Allow network access for specific unit tests",
-]
+markers = ["allow_network: Allow network access for specific unit tests"]
uv.lock (generated): 4 changed lines

@@ -1,5 +1,5 @@
version = 1
-revision = 3
+revision = 2
requires-python = ">=3.12"
resolution-markers = [
    "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1774,7 +1774,6 @@ dependencies = [
    { name = "fire" },
    { name = "h11" },
    { name = "httpx" },
-    { name = "huggingface-hub" },
    { name = "jinja2" },
    { name = "jsonschema" },
    { name = "llama-stack-client" },
@@ -1896,7 +1895,6 @@ requires-dist = [
    { name = "fire" },
    { name = "h11", specifier = ">=0.16.0" },
    { name = "httpx" },
-    { name = "huggingface-hub", specifier = ">=0.34.0,<1.0" },
    { name = "jinja2", specifier = ">=3.1.6" },
    { name = "jsonschema" },
    { name = "llama-stack-client", specifier = ">=0.2.23" },