Merge 6b616cc780 into 40fdce79b3

2025-06-27 18:50:41 +00:00 · 2025-06-27 10:49:37 +02:00 · 2025-06-27 10:49:37 +02:00 · d9d8f9beff
commit d9d8f9beff
parent 40fdce79b3 6b616cc780
127 changed files with 653 additions and 10772 deletions
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -43,14 +43,17 @@ jobs:

      - name: Build Llama Stack
        run: |
-          uv run llama stack build --template ollama --image-type venv
+          uv run llama stack build --template starter --image-type venv

      - name: Start Llama Stack server in background
        if: matrix.client-type == 'http'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          ENABLE_OLLAMA: "ollama"
+          OLLAMA_URL: "http://0.0.0.0:11434"
        run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/starter/run.yaml --image-type venv &

      - name: Wait for Llama Stack server to be ready
        if: matrix.client-type == 'http'
@ -87,16 +90,18 @@ jobs:
      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" # for library tests
+          ENABLE_OLLAMA: "ollama" # for library tests
          OLLAMA_URL: "http://0.0.0.0:11434"
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
-            stack_config="ollama"
+            stack_config="starter"
          else
            stack_config="http://localhost:8321"
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
-            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --text-model="ollama/meta-llama/Llama-3.2-3B-Instruct" \
            --embedding-model=all-MiniLM-L6-v2

      - name: Check Storage and Memory Available After Tests
--- a/README.md
+++ b/README.md
@ -139,13 +139,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
 |                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
-|                   SambaNova                   |                     [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html)   |
-|                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html)   |
-|                    Ollama                     |                       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)                       |            [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html)            |
-|                      TGI                      |                          [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general)                          |             [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html)              |
-|                   Together                    |                     [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general)                     |           [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html)           |
-|                   Fireworks                   |                    [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)                    |          [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html)           |
-| vLLM |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |
+|                Starter Distribution                 |           [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html)      |


 ### Documentation
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -85,45 +85,13 @@ The following command will allow you to see the available templates and their co
 llama stack build --list-templates
 ```

-```
------------------------------+-----------------------------------------------------------------------------+
-| Template Name                | Description                                                                 |
-+------------------------------+-----------------------------------------------------------------------------+
-| hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-+------------------------------+-----------------------------------------------------------------------------+
-| together                     | Use Together.AI for running LLM inference                                   |
-+------------------------------+-----------------------------------------------------------------------------+
-| vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
-+------------------------------+-----------------------------------------------------------------------------+
-| experimental-post-training   | Experimental template for post training                                     |
-+------------------------------+-----------------------------------------------------------------------------+
-| remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
-+------------------------------+-----------------------------------------------------------------------------+
-| fireworks                    | Use Fireworks.AI for running LLM inference                                  |
-+------------------------------+-----------------------------------------------------------------------------+
-| tgi                          | Use (an external) TGI server for running LLM inference                      |
-+------------------------------+-----------------------------------------------------------------------------+
-| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
-+------------------------------+-----------------------------------------------------------------------------+
-| meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
-+------------------------------+-----------------------------------------------------------------------------+
-| nvidia                       | Use NVIDIA NIM for running LLM inference                                    |
-+------------------------------+-----------------------------------------------------------------------------+
-| cerebras                     | Use Cerebras for running LLM inference                                      |
-+------------------------------+-----------------------------------------------------------------------------+
-| ollama                       | Use (an external) Ollama server for running LLM inference                   |
-+------------------------------+-----------------------------------------------------------------------------+
-| hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-+------------------------------+-----------------------------------------------------------------------------+
-```
-
 You may then pick a template to build your distribution with providers fitted to your liking.

 For example, to build a distribution with TGI as the inference provider, you can run:
 ```
-$ llama stack build --template tgi
+$ llama stack build --template starter
 ...
-You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
+You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
 ```
 :::
 :::{tab-item} Building from Scratch
@ -163,26 +131,7 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
 - The config file will be of contents like the ones in `llama_stack/templates/*build.yaml`.

 ```
-$ cat llama_stack/templates/ollama/build.yaml
-
-name: ollama
-distribution_spec:
-  description: Like local, but use ollama for running LLM inference
-  providers:
-    inference: remote::ollama
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
-image_name: ollama
-image_type: conda
-
-# If some providers are external, you can specify the path to the implementation
-external_providers_dir: ~/.llama/providers.d
-```
-
-```
-llama stack build --config llama_stack/templates/ollama/build.yaml
+llama stack build --config llama_stack/templates/starter/build.yaml
 ```
 :::

@ -248,11 +197,11 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm
 To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.

 ```
-llama stack build --template ollama --image-type container
+llama stack build --template starter --image-type container
 ```

 ```
-$ llama stack build --template ollama --image-type container
+$ llama stack build --template starter --image-type container
 ...
 Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
 ...
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -6,7 +6,7 @@ This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template ollama --image-type venv
+llama stack build --template starter --image-type venv
 ```

 ```python
--- a/docs/source/distributions/self_hosted_distro/bedrock.md
+++ b/docs/source/distributions/self_hosted_distro/bedrock.md
@ -1,79 +0,0 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Bedrock Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::bedrock` |
-| safety | `remote::bedrock` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
-
-### Models
-
-The following models are available by default:
-
- `meta.llama3-1-8b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta.llama3-1-70b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta.llama3-1-405b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
-
-
-## Running Llama Stack with AWS Bedrock
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-bedrock \
-  --port $LLAMA_STACK_PORT \
-  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \
-  --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION
-```
-
-### Via Conda
-
-```bash
-llama stack build --template bedrock --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \
-  --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION
-```
--- a/docs/source/distributions/self_hosted_distro/cerebras.md
+++ b/docs/source/distributions/self_hosted_distro/cerebras.md
@ -1,67 +0,0 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Cerebras Distribution
-
-The `llamastack/distribution-cerebras` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::cerebras`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `CEREBRAS_API_KEY`: Cerebras API Key (default: ``)
-
-### Models
-
-The following models are available by default:
-
- `llama3.1-8b (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `llama-3.3-70b (aliases: meta-llama/Llama-3.3-70B-Instruct)`
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/).
-
-
-## Running Llama Stack with Cerebras
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
-  llamastack/distribution-cerebras \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template cerebras --image-type conda
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
-```
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@ -1,86 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Fireworks Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| files | `inline::localfs` |
-| inference | `remote::fireworks`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``)
-
-### Models
-
-The following models are available by default:
-
- `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `accounts/fireworks/models/llama-v3p3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `accounts/fireworks/models/llama-guard-3-8b (aliases: meta-llama/Llama-Guard-3-8B)`
- `accounts/fireworks/models/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
- `accounts/fireworks/models/llama4-scout-instruct-basic (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `accounts/fireworks/models/llama4-maverick-instruct-basic (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
- `nomic-ai/nomic-embed-text-v1.5 `
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
-
-
-## Running Llama Stack with Fireworks
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-fireworks \
-  --port $LLAMA_STACK_PORT \
-  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template fireworks --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
-```
--- a/docs/source/distributions/self_hosted_distro/groq.md
+++ b/docs/source/distributions/self_hosted_distro/groq.md
@ -1,82 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Groq Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-groq` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::groq` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
-| vector_io | `inline::faiss` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `GROQ_API_KEY`: Groq API Key (default: ``)
-
-### Models
-
-The following models are available by default:
-
- `groq/llama3-8b-8192 (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `groq/llama-3.1-8b-instant `
- `groq/llama3-70b-8192 (aliases: meta-llama/Llama-3-70B-Instruct)`
- `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `groq/meta-llama/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
- `groq/meta-llama/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Groq API Key. You can get one by visiting [Groq](https://api.groq.com/).
-
-
-## Running Llama Stack with Groq
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-groq \
-  --port $LLAMA_STACK_PORT \
-  --env GROQ_API_KEY=$GROQ_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template groq --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env GROQ_API_KEY=$GROQ_API_KEY
-```
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -1,177 +0,0 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# NVIDIA Distribution
-
-The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `inline::localfs`, `remote::nvidia` |
-| eval | `remote::nvidia` |
-| inference | `remote::nvidia` |
-| post_training | `remote::nvidia` |
-| safety | `remote::nvidia` |
-| scoring | `inline::basic` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `inline::rag-runtime` |
-| vector_io | `inline::faiss` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
-
-### Models
-
-The following models are available by default:
-
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
- `nvidia/nv-embedqa-e5-v5 `
- `nvidia/nv-embedqa-mistral-7b-v2 `
- `snowflake/arctic-embed-l `
-
-
-## Prerequisites
-### NVIDIA API Keys
-
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
-
-### Deploy NeMo Microservices Platform
-The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
-
-## Supported Services
-Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
-
-### Inference: NVIDIA NIM
-NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
-  1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
-  2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
-
-The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
-
-### Datasetio API: NeMo Data Store
-The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
-
-See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
-
-### Eval API: NeMo Evaluator
-The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
-
-### Post-Training API: NeMo Customizer
-The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
-
-### Safety API: NeMo Guardrails
-The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
-
-## Deploying models
-In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
-
-Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
-```sh
-# URL to NeMo NIM Proxy service
-export NEMO_URL="http://nemo.test"
-
-curl --location "$NEMO_URL/v1/deployment/model-deployments" \
-   -H 'accept: application/json' \
-   -H 'Content-Type: application/json' \
-   -d '{
-      "name": "llama-3.2-1b-instruct",
-      "namespace": "meta",
-      "config": {
-         "model": "meta/llama-3.2-1b-instruct",
-         "nim_deployment": {
-            "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
-            "image_tag": "1.8.3",
-            "pvc_size": "25Gi",
-            "gpu": 1,
-            "additional_envs": {
-               "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
-            }
-         }
-      }
-   }'
-```
-This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
-
-You can also remove a deployed NIM to free up GPU resources, if needed.
-```sh
-export NEMO_URL="http://nemo.test"
-
-curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
-```
-
-## Running Llama Stack with NVIDIA
-
-You can do this via Conda or venv (build code), or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
-  llamastack/distribution-nvidia \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
-```
-
-### Via Conda
-
-```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
-llama stack build --template nvidia --image-type conda
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-```
-
-### Via venv
-
-If you've set up your local development environment, you can also build the image using your local virtual environment.
-
-```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
-llama stack build --template nvidia --image-type venv
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-```
-
-## Example Notebooks
-For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@ -1,165 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Ollama Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| files | `inline::localfs` |
-| inference | `remote::ollama` |
-| post_training | `inline::huggingface` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `OLLAMA_URL`: URL of the Ollama server (default: `http://127.0.0.1:11434`)
- `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `SAFETY_MODEL`: Safety model loaded into the Ollama server (default: `meta-llama/Llama-Guard-3-1B`)
-
-
-## Setting up Ollama server
-
-Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
-
-In order to load models, you can run:
-
-```bash
-export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
-
-# ollama names this model differently, and we must use the ollama name when loading the model
-export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
-ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
-```
-
-If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
-
-```bash
-export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
-
-# ollama names this model differently, and we must use the ollama name when loading the model
-export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
-ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-export LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  llamastack/distribution-ollama \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env OLLAMA_URL=http://host.docker.internal:11434
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-ollama \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env OLLAMA_URL=http://host.docker.internal:11434
-```
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-export LLAMA_STACK_PORT=8321
-
-llama stack build --template ollama --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env OLLAMA_URL=http://localhost:11434
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env OLLAMA_URL=http://localhost:11434
-```
-
-
-### (Optional) Update Model Serving Configuration
-
-```{note}
-Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models.
-```
-
-To serve a new model with `ollama`
-```bash
-ollama run <model_name>
-```
-
-To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
-```
-$ ollama ps
-NAME                         ID              SIZE      PROCESSOR    UNTIL
-llama3.2:3b-instruct-fp16    195a8c01d91e    8.6 GB    100% GPU     9 minutes from now
-```
-
-To verify that the model served by ollama is correctly connected to Llama Stack server
-```bash
-$ llama-stack-client models list
-
-Available Models
-
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
-┃ model_type   ┃ identifier                           ┃ provider_resource_id         ┃ metadata  ┃ provider_id ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
-│ llm          │ meta-llama/Llama-3.2-3B-Instruct     │ llama3.2:3b-instruct-fp16    │           │ ollama      │
-└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
-
-Total models: 1
-```
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@ -1,297 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Remote vLLM Distribution
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::vllm`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-You can use this distribution if you want to run an independent vLLM server for inference.
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
-
-
-## Setting up vLLM server
-
-In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
-server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
-[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
-that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging.
-
-### Setting up vLLM server on AMD GPU
-
-AMD provides two main vLLM container options:
- rocm/vllm: Production-ready container
- rocm/vllm-dev: Development container with the latest vLLM features
-
-Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details.
-
-Here is a sample script to start a ROCm vLLM server locally via Docker:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export CUDA_VISIBLE_DEVICES=0
-export VLLM_DIMG="rocm/vllm-dev:main"
-
-docker run \
-    --pull always \
-    --ipc=host \
-    --privileged \
-    --shm-size 16g \
-    --device=/dev/kfd \
-    --device=/dev/dri \
-    --group-add video \
-    --cap-add=SYS_PTRACE \
-    --cap-add=CAP_SYS_ADMIN \
-    --security-opt seccomp=unconfined \
-    --security-opt apparmor=unconfined \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    $VLLM_DIMG \
-    python -m vllm.entrypoints.openai.api_server \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-export VLLM_DIMG="rocm/vllm-dev:main"
-
-docker run \
-    --pull always \
-    --ipc=host \
-    --privileged \
-    --shm-size 16g \
-    --device=/dev/kfd \
-    --device=/dev/dri \
-    --group-add video \
-    --cap-add=SYS_PTRACE \
-    --cap-add=CAP_SYS_ADMIN \
-    --security-opt seccomp=unconfined \
-    --security-opt apparmor=unconfined \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    $VLLM_DIMG \
-    python -m vllm.entrypoints.openai.api_server \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-### Setting up vLLM server on NVIDIA GPU
-
-Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export CUDA_VISIBLE_DEVICES=0
-
-docker run \
-    --pull always \
-    --runtime nvidia \
-    --gpus $CUDA_VISIBLE_DEVICES \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --gpu-memory-utilization 0.7 \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-
-docker run \
-    --pull always \
-    --runtime nvidia \
-    --gpus $CUDA_VISIBLE_DEVICES \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --gpu-memory-utilization 0.7 \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-### Setting up vLLM server on Intel GPU
-
-Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend:
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
-
-Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
-export ZE_AFFINITY_MASK=0
-
-docker run \
-    --pull always \
-    --device /dev/dri \
-    -v /dev/dri/by-path:/dev/dri/by-path \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    --ipc=host \
-    intel/vllm:xpu \
-    --gpu-memory-utilization 0.7 \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export ZE_AFFINITY_MASK=1
-
-docker run \
-    --pull always \
-    --device /dev/dri \
-    -v /dev/dri/by-path:/dev/dri/by-path \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    --ipc=host \
-    intel/vllm:xpu \
-    --gpu-memory-utilization 0.7 \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=8321
-
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
-  llamastack/distribution-remote-vllm \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-remote-vllm \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env SAFETY_VLLM_URL=http://host.docker.internal:$SAFETY_PORT/v1
-```
-
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=8321
-
-cd distributions/remote-vllm
-llama stack build --template remote-vllm --image-type conda
-
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env SAFETY_VLLM_URL=http://localhost:$SAFETY_PORT/v1
-```
--- a/docs/source/distributions/self_hosted_distro/sambanova.md
+++ b/docs/source/distributions/self_hosted_distro/sambanova.md
@ -1,91 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# SambaNova Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-sambanova` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| inference | `remote::sambanova`, `inline::sentence-transformers` |
-| safety | `remote::sambanova` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``)
-
-### Models
-
-The following models are available by default:
-
- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](http://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup).
-
-
-## Running Llama Stack with SambaNova
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-
-### Via Docker
-
-```bash
-LLAMA_STACK_PORT=8321
-llama stack build --template sambanova --image-type container
-docker run \
-  -it \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  distribution-sambanova \
-  --port $LLAMA_STACK_PORT \
-  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
-```
-
-
-### Via Venv
-
-```bash
-llama stack build --template sambanova --image-type venv
-llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
-```
-
-
-### Via Conda
-
-```bash
-llama stack build --template sambanova --image-type conda
-llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
-```
--- a/docs/source/distributions/self_hosted_distro/starter.md
+++ b/docs/source/distributions/self_hosted_distro/starter.md
@ -0,0 +1,134 @@
+---
+orphan: true
+---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
+# Starter Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-starter` distribution is a comprehensive, multi-provider distribution that includes most of the available inference providers in Llama Stack. It's designed to be a one-stop solution for developers who want to experiment with different AI providers without having to configure each one individually.
+
+## Provider Composition
+
+The starter distribution consists of the following configurations:
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| files | `inline::localfs` |
+| inference | `remote::openai`, `remote::fireworks`, `remote::together`, `remote::ollama`, `remote::anthropic`, `remote::gemini`, `remote::groq`, `remote::sambanova`, `remote::vllm`, `remote::tgi`, `remote::cerebras`, `remote::llama-openai-compat`, `remote::nvidia`, `remote::hf::serverless`, `remote::hf::endpoint`, `inline::sentence-transformers`, `remote::passthrough` |
+| safety | `inline::llama-guard` |
+| post_training | `inline::huggingface` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
+
+## Inference Providers
+
+The starter distribution includes a comprehensive set of inference providers:
+
+- **OpenAI**: GPT-4, GPT-3.5, O1, O3, O4 models and text embeddings - point to the relevant provider
+  configuration documentation for more details
+- **Fireworks**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and embeddings
+- **Together**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and embeddings
+- **Anthropic**: Claude 3.5 Sonnet, Claude 3.7 Sonnet, Claude 3.5 Haiku, and Voyage embeddings
+- **Gemini**: Gemini 1.5, 2.0, 2.5 models and text embeddings
+- **Groq**: Fast Llama models (3.1, 3.2, 3.3, 4 Scout, 4 Maverick)
+- **SambaNova**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models
+- **Cerebras**: Cerebras AI models
+- **NVIDIA**: NVIDIA NIM models
+- **HuggingFace**: Serverless and endpoint models
+- **Bedrock**: AWS Bedrock models
+- **Passthrough**: Passthrough provider - use this to connect to any other inference provider that is not supported by Llama Stack
+- **Ollama**: Local Ollama models
+- **vLLM**: remote vLLM server
+- **TGI**: Text Generation Inference server - Dell Enterprise Hub's custom TGI container too (use `DEH_URL`)
+- **Sentence Transformers**: Local embedding models
+
+All providers are **disabled** by default. So you need to enable them by setting the environment
+variables. See [Enabling Providers](#enabling-providers) for more details.
+
+## Vector Providers
+
+The starter distribution includes a comprehensive set of vector providers:
+
+- **FAISS**: Local FAISS vector store - enabled by default
+- **SQLite**: Local SQLite vector store - disabled by default
+- **ChromaDB**: Remote ChromaDB server - disabled by default
+- **PGVector**: Remote PGVector server - disabled by default
+
+## Enabling Providers
+
+You can enable specific providers by setting their provider ID to a string value using environment
+variables.
+
+For instance, to enable the Ollama provider, you can set the `ENABLE_OLLAMA` environment variable to `ollama`.
+
+```bash
+export ENABLE_OLLAMA=ollama
+```
+
+To disable a provider, you can set the environment variable to `ENABLE_OLLAMA=__disabled__`.
+
+## Running the Distribution
+
+You can run the starter distribution via Docker or directly using the Llama Stack CLI.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -e ENABLE_OLLAMA=ollama \
+  -e OLLAMA_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  llamastack/distribution-starter \
+  --port $LLAMA_STACK_PORT
+```
+
+You can also use the `llama stack run` command to run the distribution.
+
+```bash
+llama stack run distributions/starter/run.yaml \
+  --port 8321 \
+  --env ENABLE_OLLAMA=ollama \
+  --env OLLAMA_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+```
+
+## Storage
+
+The starter distribution uses SQLite for local storage of various components:
+
+- **Metadata store**: `~/.llama/distributions/starter/registry.db`
+- **Inference store**: `~/.llama/distributions/starter/inference_store.db`
+- **FAISS store**: `~/.llama/distributions/starter/faiss_store.db`
+- **SQLite vector store**: `~/.llama/distributions/starter/sqlite_vec.db`
+- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
+- **Agents store**: `~/.llama/distributions/starter/agents_store.db`
+- **Responses store**: `~/.llama/distributions/starter/responses_store.db`
+- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
+- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
+- **Dataset I/O stores**: Various HuggingFace and local filesystem stores
+
+## Benefits of the Starter Distribution
+
+1. **Comprehensive Coverage**: Includes most popular AI providers in one distribution
+2. **Flexible Configuration**: Easy to enable/disable providers based on your needs
+3. **No Local GPU Required**: Most providers are cloud-based, making it accessible to developers without high-end hardware
+4. **Easy Migration**: Start with hosted providers and gradually move to local ones as needed
+5. **Production Ready**: Includes safety, evaluation, and telemetry components
+6. **Tool Integration**: Comes with web search, RAG, and model context protocol tools
+
+The starter distribution is ideal for developers who want to experiment with different AI providers, build prototypes quickly, or create applications that can work with multiple AI backends.
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@ -1,149 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-
-# TGI Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-tgi` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::tgi`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`)
- `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
-
-
-## Setting up TGI server
-
-Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
-
-```bash
-export INFERENCE_PORT=8080
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export CUDA_VISIBLE_DEVICES=0
-
-docker run --rm -it \
-  --pull always \
-  -v $HOME/.cache/huggingface:/data \
-  -p $INFERENCE_PORT:$INFERENCE_PORT \
-  --gpus $CUDA_VISIBLE_DEVICES \
-  ghcr.io/huggingface/text-generation-inference:2.3.1 \
-  --dtype bfloat16 \
-  --usage-stats off \
-  --sharded false \
-  --cuda-memory-fraction 0.7 \
-  --model-id $INFERENCE_MODEL \
-  --port $INFERENCE_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-
-docker run --rm -it \
-  --pull always \
-  -v $HOME/.cache/huggingface:/data \
-  -p $SAFETY_PORT:$SAFETY_PORT \
-  --gpus $CUDA_VISIBLE_DEVICES \
-  ghcr.io/huggingface/text-generation-inference:2.3.1 \
-  --dtype bfloat16 \
-  --usage-stats off \
-  --sharded false \
-  --model-id $SAFETY_MODEL \
-  --port $SAFETY_PORT
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-tgi \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-tgi \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
-```
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-llama stack build --template tgi --image-type conda
-llama stack run ./run.yaml
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
-```
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@ -1,86 +0,0 @@
---
-orphan: true
---
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# Together Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-together` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::meta-reference` |
-| inference | `remote::together`, `inline::sentence-transformers` |
-| safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
-| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
-
-
-### Environment Variables
-
-The following environment variables can be configured:
-
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `TOGETHER_API_KEY`: Together.AI API Key (default: ``)
-
-### Models
-
-The following models are available by default:
-
- `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta-llama/Llama-3.2-3B-Instruct-Turbo (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `meta-llama/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
- `meta-llama/Llama-Guard-3-11B-Vision-Turbo (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
- `togethercomputer/m2-bert-80M-8k-retrieval `
- `togethercomputer/m2-bert-80M-32k-retrieval `
- `meta-llama/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct, together/meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct, together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)`
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
-
-
-## Running Llama Stack with Together
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-together \
-  --port $LLAMA_STACK_PORT \
-  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template together --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
-```
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@ -58,7 +58,7 @@ which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.

 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type venv --run
+INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type venv --run
 ```
 :::
 :::{tab-item} Using `conda`
@ -69,7 +69,7 @@ which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.

 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type conda  --image-name llama3-3b-conda --run
+INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type conda  --image-name llama3-3b-conda --run
 ```
 :::
 :::{tab-item} Using a Container
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -17,7 +17,7 @@ ollama run llama3.2:3b --keepalive 60m
 #### Step 2: Run the Llama Stack server
 We will use `uv` to run the Llama Stack server.
 ```bash
-INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template ollama --image-type venv --run
+INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
 ```
 #### Step 3: Run the demo
 Now open up a new terminal and copy the following script into a file named `demo_script.py`.
--- a/docs/source/providers/post_training/huggingface.md
+++ b/docs/source/providers/post_training/huggingface.md
@ -23,7 +23,7 @@ To use the HF SFTTrainer in your Llama Stack project, follow these steps:
 You can access the HuggingFace trainer via the `ollama` distribution:

 ```bash
-llama stack build --template ollama --image-type venv
+llama stack build --template starter --image-type venv
 llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
 ```

--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@ -83,7 +83,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 1. **Build the Llama Stack**:
   Build the Llama Stack using the `ollama` template:
   ```bash
-   llama stack build --template ollama --image-type conda
+   llama stack build --template starter --image-type conda
   ```
   **Expected Output:**
   ```bash
--- a/llama_stack/distribution/providers.py
+++ b/llama_stack/distribution/providers.py
@ -84,7 +84,13 @@ class ProviderImpl(Providers):
                Each API maps to a dictionary of provider IDs to their health responses.
        """
        providers_health: dict[str, dict[str, HealthResponse]] = {}
-        timeout = 1.0
+
+        # The timeout has to be long enough to allow all the providers to be checked, especially in
+        # the case of the inference router health check since it checks all registered inference
+        # providers.
+        # The timeout must not be equal to the one set by health method for a given implementation,
+        # otherwise we will miss some providers.
+        timeout = 3.0

        async def check_provider_health(impl: Any) -> tuple[str, HealthResponse] | None:
            # Skip special implementations (inspect/providers) that don't have provider specs
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -98,6 +98,10 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):

        method = getattr(impls[api], register_method)
        for obj in objects:
+            # Do not register models on disabled providers
+            if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__":
+                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
+                continue
            # In complex templates, like our starter template, we may have dynamic model ids
            # given by environment variables. This allows those environment variables to have
            # a default value of __disabled__ to skip registration of the model if not set.
@ -106,6 +110,7 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
                and obj.provider_model_id is not None
                and "__disabled__" in obj.provider_model_id
            ):
+                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.")
                continue
            # we want to maintain the type information in arguments to method.
            # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
--- a/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/llama_stack/providers/remote/inference/cerebras/config.py
@ -26,8 +26,8 @@ class CerebrasImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "base_url": DEFAULT_BASE_URL,
-            "api_key": "${env.CEREBRAS_API_KEY}",
+            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/llama_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@ -31,7 +31,7 @@ class LlamaCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY:}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/llama_stack/providers/remote/inference/nvidia/config.py
@ -53,9 +53,15 @@ class NVIDIAConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+    def sample_run_config(
+        cls,
+        url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}",
+        api_key: str = "${env.NVIDIA_API_KEY:+}",
+        append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}",
+        **kwargs,
+    ) -> dict[str, Any]:
        return {
-            "url": "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}",
-            "api_key": "${env.NVIDIA_API_KEY:+}",
-            "append_api_version": "${env.NVIDIA_APPEND_API_VERSION:=True}",
+            "url": url,
+            "api_key": api_key,
+            "append_api_version": append_api_version,
        }
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -13,13 +13,9 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"

 class OllamaImplConfig(BaseModel):
    url: str = DEFAULT_OLLAMA_URL
-    raise_on_connect_error: bool = True

    @classmethod
-    def sample_run_config(
-        cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs
-    ) -> dict[str, Any]:
+    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
        return {
            "url": url,
-            "raise_on_connect_error": raise_on_connect_error,
        }
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -91,7 +91,6 @@ class OllamaInferenceAdapter(
    def __init__(self, config: OllamaImplConfig) -> None:
        self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
        self.url = config.url
-        self.raise_on_connect_error = config.raise_on_connect_error

    @property
    def client(self) -> AsyncClient:
@ -105,10 +104,7 @@ class OllamaInferenceAdapter(
        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
        health_response = await self.health()
        if health_response["status"] == HealthStatus.ERROR:
-            if self.raise_on_connect_error:
-                raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
-            else:
-                logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")

    async def health(self) -> HealthResponse:
        """
--- a/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/llama_stack/providers/remote/inference/passthrough/config.py
@ -24,8 +24,10 @@ class PassthroughImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+    def sample_run_config(
+        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
+    ) -> dict[str, Any]:
        return {
-            "url": "${env.PASSTHROUGH_URL}",
-            "api_key": "${env.PASSTHROUGH_API_KEY}",
+            "url": url,
+            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@ -17,7 +17,11 @@ class TGIImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, url: str = "${env.TGI_URL}", **kwargs):
+    def sample_run_config(
+        cls,
+        url: str = "${env.TGI_URL}",
+        **kwargs,
+    ):
        return {
            "url": url,
        }
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -327,7 +327,6 @@ class InferenceEndpointAdapter(_HfAdapter):
        # Get the inference endpoint details
        api = HfApi(token=config.api_token.get_secret_value())
        endpoint = api.get_inference_endpoint(config.endpoint_name)
-
        # Wait for the endpoint to be ready (if not already)
        endpoint.wait(timeout=60)

--- a/llama_stack/templates/bedrock/init.py
+++ b/llama_stack/templates/bedrock/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .bedrock import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/bedrock/bedrock.py
+++ b/llama_stack/templates/bedrock/bedrock.py
@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.distribution.datatypes import Provider, ToolGroupInput
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.bedrock.models import MODEL_ENTRIES
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::bedrock"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["remote::bedrock"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-    name = "bedrock"
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    available_models = {
-        "bedrock": MODEL_ENTRIES,
-    }
-    default_models = get_model_registry(available_models)
-
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use AWS Bedrock for running LLM inference and safety",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=default_models,
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-        },
-    )
--- a/llama_stack/templates/bedrock/build.yaml
+++ b/llama_stack/templates/bedrock/build.yaml
@ -1,34 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use AWS Bedrock for running LLM inference and safety
-  providers:
-    inference:
-    - remote::bedrock
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - remote::bedrock
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/bedrock/doc_template.md
+++ b/llama_stack/templates/bedrock/doc_template.md
@ -1,73 +0,0 @@
-# Bedrock Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
-
-
-## Running Llama Stack with AWS Bedrock
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \
-  --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION
-```
-
-### Via Conda
-
-```bash
-llama stack build --template {{ name }} --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \
-  --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION
-```
--- a/llama_stack/templates/bedrock/run.yaml
+++ b/llama_stack/templates/bedrock/run.yaml
@ -1,147 +0,0 @@
-version: 2
-image_name: bedrock
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: bedrock
-    provider_type: remote::bedrock
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/faiss_store.db
-  safety:
-  - provider_id: bedrock
-    provider_type: remote::bedrock
-    config: {}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/inference_store.db
-models:
- metadata: {}
-  model_id: meta.llama3-1-8b-instruct-v1:0
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-8b-instruct-v1:0
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-8b-instruct-v1:0
-  model_type: llm
- metadata: {}
-  model_id: meta.llama3-1-70b-instruct-v1:0
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-70b-instruct-v1:0
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-70b-instruct-v1:0
-  model_type: llm
- metadata: {}
-  model_id: meta.llama3-1-405b-instruct-v1:0
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-405b-instruct-v1:0
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: bedrock
-  provider_model_id: meta.llama3-1-405b-instruct-v1:0
-  model_type: llm
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/cerebras/init.py
+++ b/llama_stack/templates/cerebras/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .cerebras import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/cerebras/build.yaml
+++ b/llama_stack/templates/cerebras/build.yaml
@ -1,34 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use Cerebras for running LLM inference
-  providers:
-    inference:
-    - remote::cerebras
-    - inline::sentence-transformers
-    safety:
-    - inline::llama-guard
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    agents:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    telemetry:
-    - inline::meta-reference
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/cerebras/cerebras.py
+++ b/llama_stack/templates/cerebras/cerebras.py
@ -1,110 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig
-from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::cerebras", "inline::sentence-transformers"],
-        "safety": ["inline::llama-guard"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "agents": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "telemetry": ["inline::meta-reference"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-        ],
-    }
-
-    name = "cerebras"
-    inference_provider = Provider(
-        provider_id="cerebras",
-        provider_type="remote::cerebras",
-        config=CerebrasImplConfig.sample_run_config(),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-
-    available_models = {
-        "cerebras": MODEL_ENTRIES,
-    }
-    default_models = get_model_registry(available_models)
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name="cerebras",
-        distro_type="self_hosted",
-        description="Use Cerebras for running LLM inference",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=default_models + [embedding_model],
-                default_shields=[],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "CEREBRAS_API_KEY": (
-                "",
-                "Cerebras API Key",
-            ),
-        },
-    )
--- a/llama_stack/templates/cerebras/doc_template.md
+++ b/llama_stack/templates/cerebras/doc_template.md
@ -1,61 +0,0 @@
-# Cerebras Distribution
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/).
-
-
-## Running Llama Stack with Cerebras
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template cerebras --image-type conda
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
-```
--- a/llama_stack/templates/cerebras/run.yaml
+++ b/llama_stack/templates/cerebras/run.yaml
@ -1,145 +0,0 @@
-version: 2
-image_name: cerebras
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: cerebras
-    provider_type: remote::cerebras
-    config:
-      base_url: https://api.cerebras.ai
-      api_key: ${env.CEREBRAS_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/faiss_store.db
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/responses_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/trace_store.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/inference_store.db
-models:
- metadata: {}
-  model_id: llama3.1-8b
-  provider_id: cerebras
-  provider_model_id: llama3.1-8b
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: cerebras
-  provider_model_id: llama3.1-8b
-  model_type: llm
- metadata: {}
-  model_id: llama-3.3-70b
-  provider_id: cerebras
-  provider_model_id: llama-3.3-70b
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: cerebras
-  provider_model_id: llama-3.3-70b
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/ci-tests/init.py
+++ b/llama_stack/templates/ci-tests/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .ci_tests import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/ci-tests/build.yaml
+++ b/llama_stack/templates/ci-tests/build.yaml
@ -1,35 +0,0 @@
-version: 2
-distribution_spec:
-  description: Distribution for running e2e tests in CI
-  providers:
-    inference:
-    - remote::fireworks
-    - inline::sentence-transformers
-    vector_io:
-    - inline::sqlite-vec
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/ci-tests/ci_tests.py
+++ b/llama_stack/templates/ci-tests/ci_tests.py
@ -1,116 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
-    SQLiteVectorIOConfig,
-)
-from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
-from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::fireworks", "inline::sentence-transformers"],
-        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-    name = "ci-tests"
-    inference_provider = Provider(
-        provider_id="fireworks",
-        provider_type="remote::fireworks",
-        config=FireworksImplConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="sqlite-vec",
-        provider_type="inline::sqlite-vec",
-        config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-    available_models = {
-        "fireworks": MODEL_ENTRIES,
-    }
-    default_models = get_model_registry(available_models)
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Distribution for running e2e tests in CI",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=default_models + [embedding_model],
-                default_tool_groups=default_tool_groups,
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "FIREWORKS_API_KEY": (
-                "",
-                "Fireworks API Key",
-            ),
-        },
-    )
--- a/llama_stack/templates/ci-tests/run.yaml
+++ b/llama_stack/templates/ci-tests/run.yaml
@ -1,243 +0,0 @@
-version: 2
-image_name: ci-tests
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference/v1
-      api_key: ${env.FIREWORKS_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: sqlite-vec
-    provider_type: inline::sqlite-vec
-    config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/inference_store.db
-models:
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-8b
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: nomic-ai/nomic-embed-text-v1.5
-  provider_id: fireworks
-  provider_model_id: nomic-ai/nomic-embed-text-v1.5
-  model_type: embedding
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/dell/init.py
+++ b/llama_stack/templates/dell/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .dell import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/dell/build.yaml
+++ b/llama_stack/templates/dell/build.yaml
@ -1,35 +0,0 @@
-version: 2
-distribution_spec:
-  description: Dell's distribution of Llama Stack. TGI inference via Dell's custom
-    container
-  providers:
-    inference:
-    - remote::tgi
-    - inline::sentence-transformers
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/dell/dell.py
+++ b/llama_stack/templates/dell/dell.py
@ -1,142 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::tgi", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-        ],
-    }
-    name = "dell"
-    inference_provider = Provider(
-        provider_id="tgi0",
-        provider_type="remote::tgi",
-        config={
-            "url": "${env.DEH_URL}",
-        },
-    )
-    safety_inference_provider = Provider(
-        provider_id="tgi1",
-        provider_type="remote::tgi",
-        config={
-            "url": "${env.DEH_SAFETY_URL}",
-        },
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    chromadb_provider = Provider(
-        provider_id="chromadb",
-        provider_type="remote::chromadb",
-        config={
-            "url": "${env.CHROMA_URL}",
-        },
-    )
-
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="tgi0",
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="tgi1",
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="brave-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Dell's distribution of Llama Stack. TGI inference via Dell's custom container",
-        container_image=None,
-        providers=providers,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [chromadb_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        safety_inference_provider,
-                        embedding_provider,
-                    ],
-                    "vector_io": [chromadb_provider],
-                },
-                default_models=[inference_model, safety_model, embedding_model],
-                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "DEH_URL": (
-                "http://0.0.0.0:8181",
-                "URL for the Dell inference server",
-            ),
-            "DEH_SAFETY_URL": (
-                "http://0.0.0.0:8282",
-                "URL for the Dell safety inference server",
-            ),
-            "CHROMA_URL": (
-                "http://localhost:6601",
-                "URL for the Chroma server",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model loaded into the TGI server",
-            ),
-            "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
-                "Name of the safety (Llama-Guard) model to use",
-            ),
-        },
-    )
--- a/llama_stack/templates/dell/doc_template.md
+++ b/llama_stack/templates/dell/doc_template.md
@ -1,178 +0,0 @@
---
-orphan: true
---
-
-# Dell Distribution of Llama Stack
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-
-## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
-
-NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
-
-```bash
-export INFERENCE_PORT=8181
-export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
-export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-export CHROMADB_HOST=localhost
-export CHROMADB_PORT=6601
-export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
-export CUDA_VISIBLE_DEVICES=0
-export LLAMA_STACK_PORT=8321
-
-docker run --rm -it \
-  --pull always \
-  --network host \
-  -v $HOME/.cache/huggingface:/data \
-  -e HF_TOKEN=$HF_TOKEN \
-  -p $INFERENCE_PORT:$INFERENCE_PORT \
-  --gpus $CUDA_VISIBLE_DEVICES \
-  ghcr.io/huggingface/text-generation-inference \
-  --dtype bfloat16 \
-  --usage-stats off \
-  --sharded false \
-  --cuda-memory-fraction 0.7 \
-  --model-id $INFERENCE_MODEL \
-  --port $INFERENCE_PORT --hostname 0.0.0.0
-```
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_INFERENCE_PORT=8282
-export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-
-docker run --rm -it \
-  --pull always \
-  --network host \
-  -v $HOME/.cache/huggingface:/data \
-  -e HF_TOKEN=$HF_TOKEN \
-  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
-  --gpus $CUDA_VISIBLE_DEVICES \
-  ghcr.io/huggingface/text-generation-inference \
-  --dtype bfloat16 \
-  --usage-stats off \
-  --sharded false \
-  --cuda-memory-fraction 0.7 \
-  --model-id $SAFETY_MODEL \
-  --hostname 0.0.0.0 \
-  --port $SAFETY_INFERENCE_PORT
-```
-
-## Dell distribution relies on ChromaDB for vector database usage
-
-You can start a chroma-db easily using docker.
-```bash
-# This is where the indices are persisted
-mkdir -p $HOME/chromadb
-
-podman run --rm -it \
-  --network host \
-  --name chromadb \
-  -v $HOME/chromadb:/chroma/chroma \
-  -e IS_PERSISTENT=TRUE \
-  chromadb/chroma:latest \
-  --port $CHROMADB_PORT \
-  --host $CHROMADB_HOST
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-docker run -it \
-  --pull always \
-  --network host \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v $HOME/.llama:/root/.llama \
-  # NOTE: mount the llama-stack directory if testing local changes else not needed
-  -v /home/hjshah/git/llama-stack:/app/llama-stack-source \
-  # localhost/distribution-dell:dev if building / testing locally
-  llamastack/distribution-{{ name }}\
-  --port $LLAMA_STACK_PORT  \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
-
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-export SAFETY_INFERENCE_PORT=8282
-export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v $HOME/.llama:/root/.llama \
-  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
-```
-
-### Via Conda
-
-Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-llama stack build --template {{ name }} --image-type conda
-llama stack run {{ name }}
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
-```
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@ -1,134 +0,0 @@
-version: 2
-image_name: dell
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: ${env.DEH_URL}
-  - provider_id: tgi1
-    provider_type: remote::tgi
-    config:
-      url: ${env.DEH_SAFETY_URL}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: chromadb
-    provider_type: remote::chromadb
-    config:
-      url: ${env.CHROMA_URL}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: tgi0
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: tgi1
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: ${env.SAFETY_MODEL}
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: brave-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@ -1,125 +0,0 @@
-version: 2
-image_name: dell
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: ${env.DEH_URL}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: chromadb
-    provider_type: remote::chromadb
-    config:
-      url: ${env.CHROMA_URL}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: tgi0
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: brave-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/experimental-post-training/build.yaml
+++ b/llama_stack/templates/experimental-post-training/build.yaml
@ -1,30 +0,0 @@
-version: '2'
-name: experimental-post-training
-distribution_spec:
-  description: Experimental template for post training
-  container_image: null
-  providers:
-    inference:
-    - inline::meta-reference
-    - remote::ollama
-    eval:
-    - inline::meta-reference
-    scoring:
-    - inline::basic
-    - inline::braintrust
-    post_training:
-    - inline::huggingface
-    datasetio:
-    - inline::localfs
-    - remote::huggingface
-    telemetry:
-    - inline::meta-reference
-    agents:
-    - inline::meta-reference
-    safety:
-    - inline::llama-guard
-    vector_io:
-    - inline::faiss
-    tool_runtime:
-    - remote::brave-search
-image_type: conda
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@ -1,107 +0,0 @@
-version: '2'
-image_name: experimental-post-training
-container_image: null
-conda_env: experimental-post-training
-apis:
- agents
- datasetio
- eval
- inference
- vector_io
- safety
- scoring
- telemetry
- post_training
- tool_runtime
-providers:
-  inference:
-  - provider_id: meta-reference-inference
-    provider_type: inline::meta-reference
-    config:
-      max_seq_len: 4096
-      checkpoint_dir: null
-      create_distributed_process_group: False
-  - provider_id: ollama
-    provider_type: remote::ollama
-    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/localfs_datasetio.db
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/huggingface}/huggingface_datasetio.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config: {}
-  post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
-    config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/agents_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/faiss_store.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-
-
-metadata_store:
-  namespace: null
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/registry.db
-models: []
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
--- a/llama_stack/templates/fireworks/init.py
+++ b/llama_stack/templates/fireworks/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .fireworks import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/fireworks/build.yaml
+++ b/llama_stack/templates/fireworks/build.yaml
@ -1,38 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use Fireworks.AI for running LLM inference
-  providers:
-    inference:
-    - remote::fireworks
-    - inline::sentence-transformers
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    files:
-    - inline::localfs
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - remote::wolfram-alpha
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/fireworks/doc_template.md
+++ b/llama_stack/templates/fireworks/doc_template.md
@ -1,69 +0,0 @@
---
-orphan: true
---
-# Fireworks Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
-
-
-## Running Llama Stack with Fireworks
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template fireworks --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
-```
--- a/llama_stack/templates/fireworks/fireworks.py
+++ b/llama_stack/templates/fireworks/fireworks.py
@ -1,177 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
-from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::fireworks", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "files": ["inline::localfs"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "remote::wolfram-alpha",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-
-    name = "fireworks"
-
-    inference_provider = Provider(
-        provider_id="fireworks",
-        provider_type="remote::fireworks",
-        config=FireworksImplConfig.sample_run_config(),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    files_provider = Provider(
-        provider_id="meta-reference-files",
-        provider_type="inline::localfs",
-        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    available_models = {
-        "fireworks": MODEL_ENTRIES,
-    }
-    default_models = get_model_registry(available_models)
-
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::wolfram_alpha",
-            provider_id="wolfram-alpha",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use Fireworks.AI for running LLM inference",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                    "files": [files_provider],
-                },
-                default_models=default_models + [embedding_model],
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                    ],
-                    "vector_io": [vector_io_provider],
-                    "files": [files_provider],
-                    "safety": [
-                        Provider(
-                            provider_id="llama-guard",
-                            provider_type="inline::llama-guard",
-                            config={},
-                        ),
-                        Provider(
-                            provider_id="llama-guard-vision",
-                            provider_type="inline::llama-guard",
-                            config={},
-                        ),
-                        Provider(
-                            provider_id="code-scanner",
-                            provider_type="inline::code-scanner",
-                            config={},
-                        ),
-                    ],
-                },
-                default_models=[
-                    *default_models,
-                    embedding_model,
-                ],
-                default_shields=[
-                    ShieldInput(
-                        shield_id="meta-llama/Llama-Guard-3-8B",
-                        provider_id="llama-guard",
-                    ),
-                    ShieldInput(
-                        shield_id="meta-llama/Llama-Guard-3-11B-Vision",
-                        provider_id="llama-guard-vision",
-                    ),
-                    ShieldInput(
-                        shield_id="CodeScanner",
-                        provider_id="code-scanner",
-                    ),
-                ],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "FIREWORKS_API_KEY": (
-                "",
-                "Fireworks.AI API Key",
-            ),
-        },
-    )
--- a/llama_stack/templates/fireworks/remote-hosted-report.md
+++ b/llama_stack/templates/fireworks/remote-hosted-report.md
@ -1,45 +0,0 @@
-# Report for fireworks distribution
-
-## Supported Models
-| Model Descriptor | fireworks |
-|:---|:---|
-| meta-llama/Llama-3-8B-Instruct | ❌ |
-| meta-llama/Llama-3-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-8B-Instruct | ❌ |
-| meta-llama/Llama-3.1-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
-| meta-llama/Llama-3.2-1B-Instruct | ❌ |
-| meta-llama/Llama-3.2-3B-Instruct | ❌ |
-| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.3-70B-Instruct | ❌ |
-| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
-| meta-llama/Llama-Guard-3-1B | ❌ |
-| meta-llama/Llama-Guard-3-8B | ❌ |
-| meta-llama/Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Text | /chat_completion | streaming | test_text_chat_completion_streaming | ❌ |
-| Vision | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Vision | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Text | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ❌ |
-| Text | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ❌ |
-| Text | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ❌ |
-| Text | /completion | streaming | test_text_completion_streaming | ❌ |
-| Text | /completion | non_streaming | test_text_completion_non_streaming | ❌ |
-| Text | /completion | structured_output | test_text_completion_structured_output | ❌ |
-
-## Memory:
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /insert, /query | inline | test_memory_bank_insert_inline_and_query | ❌ |
-| /insert, /query | url | test_memory_bank_insert_from_url_and_query | ❌ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| create_agent_turn | rag | test_rag_agent | ❌ |
-| create_agent_turn | custom_tool | test_custom_tool | ❌ |
-| create_agent_turn | code_execution | test_code_execution | ❌ |
--- a/llama_stack/templates/fireworks/run-with-safety.yaml
+++ b/llama_stack/templates/fireworks/run-with-safety.yaml
@ -1,271 +0,0 @@
-version: 2
-image_name: fireworks
-apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference/v1
-      api_key: ${env.FIREWORKS_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  - provider_id: llama-guard-vision
-    provider_type: inline::llama-guard
-    config: {}
-  - provider_id: code-scanner
-    provider_type: inline::code-scanner
-    config: {}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  files:
-  - provider_id: meta-reference-files
-    provider_type: inline::localfs
-    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/fireworks/files}
-      metadata_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/files_metadata.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/inference_store.db
-models:
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-8b
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: nomic-ai/nomic-embed-text-v1.5
-  provider_id: fireworks
-  provider_model_id: nomic-ai/nomic-embed-text-v1.5
-  model_type: embedding
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-  provider_id: llama-guard
- shield_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: llama-guard-vision
- shield_id: CodeScanner
-  provider_id: code-scanner
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@ -1,261 +0,0 @@
-version: 2
-image_name: fireworks
-apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference/v1
-      api_key: ${env.FIREWORKS_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  files:
-  - provider_id: meta-reference-files
-    provider_type: inline::localfs
-    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/fireworks/files}
-      metadata_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/files_metadata.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/inference_store.db
-models:
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-8b
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: fireworks
-  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
-  model_type: llm
- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: nomic-ai/nomic-embed-text-v1.5
-  provider_id: fireworks
-  provider_model_id: nomic-ai/nomic-embed-text-v1.5
-  model_type: embedding
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/groq/init.py
+++ b/llama_stack/templates/groq/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .groq import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/groq/build.yaml
+++ b/llama_stack/templates/groq/build.yaml
@ -1,31 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use Groq for running LLM inference
-  providers:
-    inference:
-    - remote::groq
-    vector_io:
-    - inline::faiss
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/groq/doc_template.md
+++ b/llama_stack/templates/groq/doc_template.md
@ -1,69 +0,0 @@
---
-orphan: true
---
-# Groq Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
-
-
-### Prerequisite: API Keys
-
-Make sure you have access to a Groq API Key. You can get one by visiting [Groq](https://api.groq.com/).
-
-
-## Running Llama Stack with Groq
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env GROQ_API_KEY=$GROQ_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template groq --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env GROQ_API_KEY=$GROQ_API_KEY
-```
--- a/llama_stack/templates/groq/groq.py
+++ b/llama_stack/templates/groq/groq.py
@ -1,103 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.remote.inference.groq import GroqConfig
-from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::groq"],
-        "vector_io": ["inline::faiss"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-        ],
-    }
-    name = "groq"
-
-    inference_provider = Provider(
-        provider_id=name,
-        provider_type=f"remote::{name}",
-        config=GroqConfig.sample_run_config(),
-    )
-
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-
-    available_models = {
-        "groq": MODEL_ENTRIES,
-    }
-    default_models = get_model_registry(available_models)
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use Groq for running LLM inference",
-        docker_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                },
-                default_models=default_models + [embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMASTACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "GROQ_API_KEY": (
-                "",
-                "Groq API Key",
-            ),
-        },
-    )
--- a/llama_stack/templates/groq/run.yaml
+++ b/llama_stack/templates/groq/run.yaml
@ -1,210 +0,0 @@
-version: 2
-image_name: groq
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: groq
-    provider_type: remote::groq
-    config:
-      url: https://api.groq.com
-      api_key: ${env.GROQ_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/inference_store.db
-models:
- metadata: {}
-  model_id: groq/llama3-8b-8192
-  provider_id: groq
-  provider_model_id: groq/llama3-8b-8192
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-3.1-8B-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama3-8b-8192
-  model_type: llm
- metadata: {}
-  model_id: groq/llama-3.1-8b-instant
-  provider_id: groq
-  provider_model_id: groq/llama-3.1-8b-instant
-  model_type: llm
- metadata: {}
-  model_id: groq/llama3-70b-8192
-  provider_id: groq
-  provider_model_id: groq/llama3-70b-8192
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-3-70B-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama3-70b-8192
-  model_type: llm
- metadata: {}
-  model_id: groq/llama-3.3-70b-versatile
-  provider_id: groq
-  provider_model_id: groq/llama-3.3-70b-versatile
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-3.3-70B-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama-3.3-70b-versatile
-  model_type: llm
- metadata: {}
-  model_id: groq/llama-3.2-3b-preview
-  provider_id: groq
-  provider_model_id: groq/llama-3.2-3b-preview
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-3.2-3B-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama-3.2-3b-preview
-  model_type: llm
- metadata: {}
-  model_id: groq/llama-4-scout-17b-16e-instruct
-  provider_id: groq
-  provider_model_id: groq/llama-4-scout-17b-16e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama-4-scout-17b-16e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
-  provider_id: groq
-  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: groq
-  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/llama-4-maverick-17b-128e-instruct
-  provider_id: groq
-  provider_model_id: groq/llama-4-maverick-17b-128e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: groq
-  provider_model_id: groq/llama-4-maverick-17b-128e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
-  provider_id: groq
-  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
-  model_type: llm
- metadata: {}
-  model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: groq
-  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/hf-endpoint/init.py
+++ b/llama_stack/templates/hf-endpoint/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .hf_endpoint import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/hf-endpoint/build.yaml
+++ b/llama_stack/templates/hf-endpoint/build.yaml
@ -1,34 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
-  providers:
-    inference:
-    - remote::hf::endpoint
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/hf-endpoint/hf_endpoint.py
+++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py
@ -1,149 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::hf::endpoint"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-    name = "hf-endpoint"
-    inference_provider = Provider(
-        provider_id="hf-endpoint",
-        provider_type="remote::hf::endpoint",
-        config=InferenceEndpointImplConfig.sample_run_config(),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="hf-endpoint",
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="hf-endpoint-safety",
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                        Provider(
-                            provider_id="hf-endpoint-safety",
-                            provider_type="remote::hf::endpoint",
-                            config=InferenceEndpointImplConfig.sample_run_config(
-                                endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
-                            ),
-                        ),
-                    ],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[
-                    inference_model,
-                    safety_model,
-                    embedding_model,
-                ],
-                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "HF_API_TOKEN": (
-                "hf_...",
-                "Hugging Face API token",
-            ),
-            "INFERENCE_ENDPOINT_NAME": (
-                "",
-                "HF Inference endpoint name for the main inference model",
-            ),
-            "SAFETY_INFERENCE_ENDPOINT_NAME": (
-                "",
-                "HF Inference endpoint for the safety model",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model served by the HF Inference Endpoint",
-            ),
-            "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
-                "Safety model served by the HF Inference Endpoint",
-            ),
-        },
-    )
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@ -1,142 +0,0 @@
-version: 2
-image_name: hf-endpoint
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: hf-endpoint
-    provider_type: remote::hf::endpoint
-    config:
-      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
-      api_token: ${env.HF_API_TOKEN}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  - provider_id: hf-endpoint-safety
-    provider_type: remote::hf::endpoint
-    config:
-      endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
-      api_token: ${env.HF_API_TOKEN}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: hf-endpoint
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: hf-endpoint-safety
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: ${env.SAFETY_MODEL}
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@ -1,132 +0,0 @@
-version: 2
-image_name: hf-endpoint
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: hf-endpoint
-    provider_type: remote::hf::endpoint
-    config:
-      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
-      api_token: ${env.HF_API_TOKEN}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: hf-endpoint
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/hf-serverless/init.py
+++ b/llama_stack/templates/hf-serverless/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .hf_serverless import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/hf-serverless/build.yaml
+++ b/llama_stack/templates/hf-serverless/build.yaml
@ -1,35 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
-  providers:
-    inference:
-    - remote::hf::serverless
-    - inline::sentence-transformers
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/hf-serverless/hf_serverless.py
+++ b/llama_stack/templates/hf-serverless/hf_serverless.py
@ -1,142 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::hf::serverless", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-
-    name = "hf-serverless"
-    inference_provider = Provider(
-        provider_id="hf-serverless",
-        provider_type="remote::hf::serverless",
-        config=InferenceAPIImplConfig.sample_run_config(),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="hf-serverless",
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="hf-serverless-safety",
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                        Provider(
-                            provider_id="hf-serverless-safety",
-                            provider_type="remote::hf::serverless",
-                            config=InferenceAPIImplConfig.sample_run_config(
-                                repo="${env.SAFETY_MODEL}",
-                            ),
-                        ),
-                    ],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[
-                    inference_model,
-                    safety_model,
-                    embedding_model,
-                ],
-                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "HF_API_TOKEN": (
-                "hf_...",
-                "Hugging Face API token",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model to be served by the HF Serverless endpoint",
-            ),
-            "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
-                "Safety model to be served by the HF Serverless endpoint",
-            ),
-        },
-    )
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@ -1,142 +0,0 @@
-version: 2
-image_name: hf-serverless
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: hf-serverless
-    provider_type: remote::hf::serverless
-    config:
-      huggingface_repo: ${env.INFERENCE_MODEL}
-      api_token: ${env.HF_API_TOKEN}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  - provider_id: hf-serverless-safety
-    provider_type: remote::hf::serverless
-    config:
-      huggingface_repo: ${env.SAFETY_MODEL}
-      api_token: ${env.HF_API_TOKEN}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: hf-serverless
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: hf-serverless-safety
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: ${env.SAFETY_MODEL}
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@ -1,132 +0,0 @@
-version: 2
-image_name: hf-serverless
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: hf-serverless
-    provider_type: remote::hf::serverless
-    config:
-      huggingface_repo: ${env.INFERENCE_MODEL}
-      api_token: ${env.HF_API_TOKEN}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: hf-serverless
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/llama_api/init.py
+++ b/llama_stack/templates/llama_api/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .llama_api import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/llama_api/build.yaml
+++ b/llama_stack/templates/llama_api/build.yaml
@ -1,35 +0,0 @@
-version: 2
-distribution_spec:
-  description: Distribution for running e2e tests in CI
-  providers:
-    inference:
-    - remote::llama-openai-compat
-    - inline::sentence-transformers
-    vector_io:
-    - inline::sqlite-vec
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/llama_api/llama_api.py
+++ b/llama_stack/templates/llama_api/llama_api.py
@ -1,153 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
-    SQLiteVectorIOConfig,
-)
-from llama_stack.providers.remote.inference.llama_openai_compat.config import (
-    LlamaCompatConfig,
-)
-from llama_stack.providers.remote.inference.llama_openai_compat.models import (
-    MODEL_ENTRIES as LLLAMA_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.remote.vector_io.pgvector.config import (
-    PGVectorVectorIOConfig,
-)
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    get_model_registry,
-)
-
-
-def get_inference_providers() -> tuple[list[Provider], list[ModelInput]]:
-    # in this template, we allow each API key to be optional
-    providers = [
-        (
-            "llama-openai-compat",
-            LLLAMA_MODEL_ENTRIES,
-            LlamaCompatConfig.sample_run_config(api_key="${env.LLAMA_API_KEY:+}"),
-        ),
-    ]
-    inference_providers = []
-    available_models = {}
-    for provider_id, model_entries, config in providers:
-        inference_providers.append(
-            Provider(
-                provider_id=provider_id,
-                provider_type=f"remote::{provider_id}",
-                config=config,
-            )
-        )
-        available_models[provider_id] = model_entries
-    return inference_providers, available_models
-
-
-def get_distribution_template() -> DistributionTemplate:
-    inference_providers, available_models = get_inference_providers()
-    providers = {
-        "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
-        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-    name = "llama_api"
-
-    vector_io_providers = [
-        Provider(
-            provider_id="sqlite-vec",
-            provider_type="inline::sqlite-vec",
-            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_CHROMADB:+chromadb}",
-            provider_type="remote::chromadb",
-            config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:+}"),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_PGVECTOR:+pgvector}",
-            provider_type="remote::pgvector",
-            config=PGVectorVectorIOConfig.sample_run_config(
-                db="${env.PGVECTOR_DB:+}",
-                user="${env.PGVECTOR_USER:+}",
-                password="${env.PGVECTOR_PASSWORD:+}",
-            ),
-        ),
-    ]
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id=embedding_provider.provider_id,
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-
-    default_models = get_model_registry(available_models)
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Distribution for running e2e tests in CI",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": inference_providers + [embedding_provider],
-                    "vector_io": vector_io_providers,
-                },
-                default_models=default_models + [embedding_model],
-                default_tool_groups=default_tool_groups,
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-        },
-    )
--- a/llama_stack/templates/llama_api/run.yaml
+++ b/llama_stack/templates/llama_api/run.yaml
@ -1,168 +0,0 @@
-version: 2
-image_name: llama_api
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: llama-openai-compat
-    provider_type: remote::llama-openai-compat
-    config:
-      openai_compat_api_base: https://api.llama.com/compat/v1/
-      api_key: ${env.LLAMA_API_KEY:+}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: sqlite-vec
-    provider_type: inline::sqlite-vec
-    config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/sqlite_vec.db
-  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-    provider_type: remote::chromadb
-    config:
-      url: ${env.CHROMADB_URL:+}
-  - provider_id: ${env.ENABLE_PGVECTOR:+pgvector}
-    provider_type: remote::pgvector
-    config:
-      host: ${env.PGVECTOR_HOST:=localhost}
-      port: ${env.PGVECTOR_PORT:=5432}
-      db: ${env.PGVECTOR_DB:+}
-      user: ${env.PGVECTOR_USER:+}
-      password: ${env.PGVECTOR_PASSWORD:+}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/inference_store.db
-models:
- metadata: {}
-  model_id: Llama-3.3-70B-Instruct
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-3.3-70B-Instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-3.3-70B-Instruct
-  model_type: llm
- metadata: {}
-  model_id: Llama-4-Scout-17B-16E-Instruct-FP8
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
-  model_type: llm
- metadata: {}
-  model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-  provider_id: llama-openai-compat
-  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/nvidia/init.py
+++ b/llama_stack/templates/nvidia/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .nvidia import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@ -1,29 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use NVIDIA NIM for running LLM inference, evaluation and safety
-  providers:
-    inference:
-    - remote::nvidia
-    vector_io:
-    - inline::faiss
-    safety:
-    - remote::nvidia
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - remote::nvidia
-    post_training:
-    - remote::nvidia
-    datasetio:
-    - inline::localfs
-    - remote::nvidia
-    scoring:
-    - inline::basic
-    tool_runtime:
-    - inline::rag-runtime
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@ -1,149 +0,0 @@
-# NVIDIA Distribution
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
-
-
-## Prerequisites
-### NVIDIA API Keys
-
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
-
-### Deploy NeMo Microservices Platform
-The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
-
-## Supported Services
-Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
-
-### Inference: NVIDIA NIM
-NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
-  1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
-  2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
-
-The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
-
-### Datasetio API: NeMo Data Store
-The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
-
-See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
-
-### Eval API: NeMo Evaluator
-The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
-
-### Post-Training API: NeMo Customizer
-The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
-
-### Safety API: NeMo Guardrails
-The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
-
-See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
-
-## Deploying models
-In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
-
-Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
-```sh
-# URL to NeMo NIM Proxy service
-export NEMO_URL="http://nemo.test"
-
-curl --location "$NEMO_URL/v1/deployment/model-deployments" \
-   -H 'accept: application/json' \
-   -H 'Content-Type: application/json' \
-   -d '{
-      "name": "llama-3.2-1b-instruct",
-      "namespace": "meta",
-      "config": {
-         "model": "meta/llama-3.2-1b-instruct",
-         "nim_deployment": {
-            "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
-            "image_tag": "1.8.3",
-            "pvc_size": "25Gi",
-            "gpu": 1,
-            "additional_envs": {
-               "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
-            }
-         }
-      }
-   }'
-```
-This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
-
-You can also remove a deployed NIM to free up GPU resources, if needed.
-```sh
-export NEMO_URL="http://nemo.test"
-
-curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
-```
-
-## Running Llama Stack with NVIDIA
-
-You can do this via Conda or venv (build code), or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
-```
-
-### Via Conda
-
-```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
-llama stack build --template nvidia --image-type conda
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-```
-
-### Via venv
-
-If you've set up your local development environment, you can also build the image using your local virtual environment.
-
-```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
-llama stack build --template nvidia --image-type venv
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-```
-
-## Example Notebooks
-For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@ -1,150 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
-from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
-from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
-from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
-from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
-from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::nvidia"],
-        "vector_io": ["inline::faiss"],
-        "safety": ["remote::nvidia"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["remote::nvidia"],
-        "post_training": ["remote::nvidia"],
-        "datasetio": ["inline::localfs", "remote::nvidia"],
-        "scoring": ["inline::basic"],
-        "tool_runtime": ["inline::rag-runtime"],
-    }
-
-    inference_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NVIDIAConfig.sample_run_config(),
-    )
-    safety_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NVIDIASafetyConfig.sample_run_config(),
-    )
-    datasetio_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NvidiaDatasetIOConfig.sample_run_config(),
-    )
-    eval_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NVIDIAEvalConfig.sample_run_config(),
-    )
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="nvidia",
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="nvidia",
-    )
-
-    available_models = {
-        "nvidia": MODEL_ENTRIES,
-    }
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    default_models = get_model_registry(available_models)
-    return DistributionTemplate(
-        name="nvidia",
-        distro_type="self_hosted",
-        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider=available_models,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider],
-                    "datasetio": [datasetio_provider],
-                    "eval": [eval_provider],
-                },
-                default_models=default_models,
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        safety_provider,
-                    ],
-                    "eval": [eval_provider],
-                },
-                default_models=[inference_model, safety_model],
-                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "NVIDIA_API_KEY": (
-                "",
-                "NVIDIA API Key",
-            ),
-            "NVIDIA_APPEND_API_VERSION": (
-                "True",
-                "Whether to append the API version to the base_url",
-            ),
-            ## Nemo Customizer related variables
-            "NVIDIA_DATASET_NAMESPACE": (
-                "default",
-                "NVIDIA Dataset Namespace",
-            ),
-            "NVIDIA_PROJECT_ID": (
-                "test-project",
-                "NVIDIA Project ID",
-            ),
-            "NVIDIA_CUSTOMIZER_URL": (
-                "https://customizer.api.nvidia.com",
-                "NVIDIA Customizer URL",
-            ),
-            "NVIDIA_OUTPUT_MODEL_DIR": (
-                "test-example-model@v1",
-                "NVIDIA Output Model Directory",
-            ),
-            "GUARDRAILS_SERVICE_URL": (
-                "http://0.0.0.0:7331",
-                "URL for the NeMo Guardrails Service",
-            ),
-            "NVIDIA_GUARDRAILS_CONFIG_ID": (
-                "self-check",
-                "NVIDIA Guardrail Configuration ID",
-            ),
-            "NVIDIA_EVALUATOR_URL": (
-                "http://0.0.0.0:7331",
-                "URL for the NeMo Evaluator Service",
-            ),
-            "INFERENCE_MODEL": (
-                "Llama3.1-8B-Instruct",
-                "Inference model",
-            ),
-            "SAFETY_MODEL": (
-                "meta/llama-3.1-8b-instruct",
-                "Name of the model to use for safety",
-            ),
-        },
-    )
--- a/llama_stack/templates/nvidia/run-with-safety.yaml
+++ b/llama_stack/templates/nvidia/run-with-safety.yaml
@ -1,121 +0,0 @@
-version: 2
-image_name: nvidia
-apis:
- agents
- datasetio
- eval
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
-      api_key: ${env.NVIDIA_API_KEY:+}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
-      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/faiss_store.db
-  safety:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
-      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  post_training:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:+}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
-  datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/localfs_datasetio.db
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:+}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  tool_runtime:
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: nvidia
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: nvidia
-  model_type: llm
-shields:
- shield_id: ${env.SAFETY_MODEL}
-  provider_id: nvidia
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@ -1,227 +0,0 @@
-version: 2
-image_name: nvidia
-apis:
- agents
- datasetio
- eval
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
-      api_key: ${env.NVIDIA_API_KEY:+}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/faiss_store.db
-  safety:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
-      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  post_training:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:+}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
-  datasetio:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:+}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  tool_runtime:
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/inference_store.db
-models:
- metadata: {}
-  model_id: meta/llama3-8b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3-8B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama3-70b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.1-8b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.1-70b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.1-405b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-405b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.2-1b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-1b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-1b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.2-3b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-3b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.2-11b-vision-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-11b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.2-90b-vision-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-90b-vision-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta/llama-3.3-70b-instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.3-70b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.3-70b-instruct
-  model_type: llm
- metadata:
-    embedding_dimension: 2048
-    context_length: 8192
-  model_id: nvidia/llama-3.2-nv-embedqa-1b-v2
-  provider_id: nvidia
-  provider_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2
-  model_type: embedding
- metadata:
-    embedding_dimension: 1024
-    context_length: 512
-  model_id: nvidia/nv-embedqa-e5-v5
-  provider_id: nvidia
-  provider_model_id: nvidia/nv-embedqa-e5-v5
-  model_type: embedding
- metadata:
-    embedding_dimension: 4096
-    context_length: 512
-  model_id: nvidia/nv-embedqa-mistral-7b-v2
-  provider_id: nvidia
-  provider_model_id: nvidia/nv-embedqa-mistral-7b-v2
-  model_type: embedding
- metadata:
-    embedding_dimension: 1024
-    context_length: 512
-  model_id: snowflake/arctic-embed-l
-  provider_id: nvidia
-  provider_model_id: snowflake/arctic-embed-l
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/ollama/init.py
+++ b/llama_stack/templates/ollama/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .ollama import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@ -1,39 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use (an external) Ollama server for running LLM inference
-  providers:
-    inference:
-    - remote::ollama
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    files:
-    - inline::localfs
-    post_training:
-    - inline::huggingface
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-    - remote::wolfram-alpha
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@ -1,152 +0,0 @@
---
-orphan: true
---
-# Ollama Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-
-## Setting up Ollama server
-
-Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
-
-In order to load models, you can run:
-
-```bash
-export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
-
-# ollama names this model differently, and we must use the ollama name when loading the model
-export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
-ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
-```
-
-If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
-
-```bash
-export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
-
-# ollama names this model differently, and we must use the ollama name when loading the model
-export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
-ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-export LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env OLLAMA_URL=http://host.docker.internal:11434
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env OLLAMA_URL=http://host.docker.internal:11434
-```
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-export LLAMA_STACK_PORT=8321
-
-llama stack build --template {{ name }} --image-type conda
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env OLLAMA_URL=http://localhost:11434
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env OLLAMA_URL=http://localhost:11434
-```
-
-
-### (Optional) Update Model Serving Configuration
-
-```{note}
-Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models.
-```
-
-To serve a new model with `ollama`
-```bash
-ollama run <model_name>
-```
-
-To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
-```
-$ ollama ps
-NAME                         ID              SIZE      PROCESSOR    UNTIL
-llama3.2:3b-instruct-fp16    195a8c01d91e    8.6 GB    100% GPU     9 minutes from now
-```
-
-To verify that the model served by ollama is correctly connected to Llama Stack server
-```bash
-$ llama-stack-client models list
-
-Available Models
-
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
-┃ model_type   ┃ identifier                           ┃ provider_resource_id         ┃ metadata  ┃ provider_id ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
-│ llm          │ meta-llama/Llama-3.2-3B-Instruct     │ llama3.2:3b-instruct-fp16    │           │ ollama      │
-└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
-
-Total models: 1
-```
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@ -1,169 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
-from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::ollama"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "files": ["inline::localfs"],
-        "post_training": ["inline::huggingface"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-            "remote::wolfram-alpha",
-        ],
-    }
-    name = "ollama"
-    inference_provider = Provider(
-        provider_id="ollama",
-        provider_type="remote::ollama",
-        config=OllamaImplConfig.sample_run_config(),
-    )
-    vector_io_provider_faiss = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    files_provider = Provider(
-        provider_id="meta-reference-files",
-        provider_type="inline::localfs",
-        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    posttraining_provider = Provider(
-        provider_id="huggingface",
-        provider_type="inline::huggingface",
-        config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="ollama",
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="ollama",
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="ollama",
-        provider_model_id="all-minilm:latest",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::wolfram_alpha",
-            provider_id="wolfram-alpha",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use (an external) Ollama server for running LLM inference",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider],
-                    "vector_io": [vector_io_provider_faiss],
-                    "files": [files_provider],
-                    "post_training": [posttraining_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider],
-                    "vector_io": [vector_io_provider_faiss],
-                    "files": [files_provider],
-                    "post_training": [posttraining_provider],
-                    "safety": [
-                        Provider(
-                            provider_id="llama-guard",
-                            provider_type="inline::llama-guard",
-                            config={},
-                        ),
-                        Provider(
-                            provider_id="code-scanner",
-                            provider_type="inline::code-scanner",
-                            config={},
-                        ),
-                    ],
-                },
-                default_models=[
-                    inference_model,
-                    safety_model,
-                    embedding_model,
-                ],
-                default_shields=[
-                    ShieldInput(
-                        shield_id="${env.SAFETY_MODEL}",
-                        provider_id="llama-guard",
-                    ),
-                    ShieldInput(
-                        shield_id="CodeScanner",
-                        provider_id="code-scanner",
-                    ),
-                ],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "OLLAMA_URL": (
-                "http://127.0.0.1:11434",
-                "URL of the Ollama server",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model loaded into the Ollama server",
-            ),
-            "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
-                "Safety model loaded into the Ollama server",
-            ),
-        },
-    )
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@ -1,163 +0,0 @@
-version: 2
-image_name: ollama
-apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: ollama
-    provider_type: remote::ollama
-    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
-      raise_on_connect_error: true
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  - provider_id: code-scanner
-    provider_type: inline::code-scanner
-    config: {}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  files:
-  - provider_id: meta-reference-files
-    provider_type: inline::localfs
-    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ollama/files}
-      metadata_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/files_metadata.db
-  post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
-    config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: ollama
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: ollama
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-shields:
- shield_id: ${env.SAFETY_MODEL}
-  provider_id: llama-guard
- shield_id: CodeScanner
-  provider_id: code-scanner
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
-server:
-  port: 8321
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@ -1,153 +0,0 @@
-version: 2
-image_name: ollama
-apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: ollama
-    provider_type: remote::ollama
-    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
-      raise_on_connect_error: true
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  files:
-  - provider_id: meta-reference-files
-    provider_type: inline::localfs
-    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ollama/files}
-      metadata_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/files_metadata.db
-  post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
-    config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: ollama
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
-server:
-  port: 8321
--- a/llama_stack/templates/passthrough/init.py
+++ b/llama_stack/templates/passthrough/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .passthrough import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/passthrough/build.yaml
+++ b/llama_stack/templates/passthrough/build.yaml
@ -1,36 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use Passthrough hosted llama-stack endpoint for LLM inference
-  providers:
-    inference:
-    - remote::passthrough
-    - inline::sentence-transformers
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    telemetry:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - remote::wolfram-alpha
-    - inline::rag-runtime
-    - remote::model-context-protocol
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/passthrough/doc_template.md
+++ b/llama_stack/templates/passthrough/doc_template.md
@ -1,35 +0,0 @@
---
-orphan: true
---
-# Passthrough Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
-
-{{ providers_table }}
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-{% if default_models %}
-### Models
-
-The following models are available by default:
-
-{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
-{% endfor %}
-{% endif %}
--- a/llama_stack/templates/passthrough/passthrough.py
+++ b/llama_stack/templates/passthrough/passthrough.py
@ -1,193 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.providers.remote.inference.passthrough.config import (
-    PassthroughImplConfig,
-)
-from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["remote::passthrough", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "remote::wolfram-alpha",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-
-    name = "passthrough"
-
-    inference_provider = Provider(
-        provider_id="passthrough",
-        provider_type="remote::passthrough",
-        config=PassthroughImplConfig.sample_run_config(),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    default_models = [
-        ModelInput(
-            metadata={},
-            model_id="meta-llama/Llama-3.1-8B-Instruct",
-            provider_id="passthrough",
-            provider_model_id="llama3.1-8b-instruct",
-            model_type=ModelType.llm,
-        ),
-        ModelInput(
-            metadata={},
-            model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",
-            provider_id="passthrough",
-            provider_model_id="llama3.2-11b-vision-instruct",
-            model_type=ModelType.llm,
-        ),
-    ]
-
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::wolfram_alpha",
-            provider_id="wolfram-alpha",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use Passthrough hosted llama-stack endpoint for LLM inference",
-        container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        available_models_by_provider={
-            "passthrough": [
-                ProviderModelEntry(
-                    provider_model_id="llama3.1-8b-instruct",
-                    model_type=ModelType.llm,
-                ),
-                ProviderModelEntry(
-                    provider_model_id="llama3.2-11b-vision-instruct",
-                    model_type=ModelType.llm,
-                ),
-            ],
-        },
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=default_models + [embedding_model],
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                    ],
-                    "vector_io": [vector_io_provider],
-                    "safety": [
-                        Provider(
-                            provider_id="llama-guard",
-                            provider_type="inline::llama-guard",
-                            config={},
-                        ),
-                        Provider(
-                            provider_id="llama-guard-vision",
-                            provider_type="inline::llama-guard",
-                            config={},
-                        ),
-                        Provider(
-                            provider_id="code-scanner",
-                            provider_type="inline::code-scanner",
-                            config={},
-                        ),
-                    ],
-                },
-                default_models=[
-                    *default_models,
-                    embedding_model,
-                ],
-                default_shields=[
-                    ShieldInput(
-                        shield_id="meta-llama/Llama-Guard-3-8B",
-                        provider_id="llama-guard",
-                    ),
-                    ShieldInput(
-                        shield_id="meta-llama/Llama-Guard-3-11B-Vision",
-                        provider_id="llama-guard-vision",
-                    ),
-                    ShieldInput(
-                        shield_id="CodeScanner",
-                        provider_id="code-scanner",
-                    ),
-                ],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "PASSTHROUGH_API_KEY": (
-                "",
-                "Passthrough API Key",
-            ),
-            "PASSTHROUGH_URL": (
-                "",
-                "Passthrough URL",
-            ),
-        },
-    )
--- a/llama_stack/templates/passthrough/run-with-safety.yaml
+++ b/llama_stack/templates/passthrough/run-with-safety.yaml
@ -1,155 +0,0 @@
-version: 2
-image_name: passthrough
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: passthrough
-    provider_type: remote::passthrough
-    config:
-      url: ${env.PASSTHROUGH_URL}
-      api_key: ${env.PASSTHROUGH_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  - provider_id: llama-guard-vision
-    provider_type: inline::llama-guard
-    config: {}
-  - provider_id: code-scanner
-    provider_type: inline::code-scanner
-    config: {}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/inference_store.db
-models:
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: passthrough
-  provider_model_id: llama3.1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: passthrough
-  provider_model_id: llama3.2-11b-vision-instruct
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-  provider_id: llama-guard
- shield_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: llama-guard-vision
- shield_id: CodeScanner
-  provider_id: code-scanner
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/passthrough/run.yaml
+++ b/llama_stack/templates/passthrough/run.yaml
@ -1,145 +0,0 @@
-version: 2
-image_name: passthrough
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: passthrough
-    provider_type: remote::passthrough
-    config:
-      url: ${env.PASSTHROUGH_URL}
-      api_key: ${env.PASSTHROUGH_API_KEY}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/responses_store.db
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/trace_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/inference_store.db
-models:
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: passthrough
-  provider_model_id: llama3.1-8b-instruct
-  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: passthrough
-  provider_model_id: llama3.2-11b-vision-instruct
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: meta-llama/Llama-Guard-3-8B
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
--- a/llama_stack/templates/remote-vllm/init.py
+++ b/llama_stack/templates/remote-vllm/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .vllm import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@ -1,36 +0,0 @@
-version: 2
-distribution_spec:
-  description: Use (an external) vLLM server for running LLM inference
-  providers:
-    inference:
-    - remote::vllm
-    - inline::sentence-transformers
-    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
-    safety:
-    - inline::llama-guard
-    agents:
-    - inline::meta-reference
-    eval:
-    - inline::meta-reference
-    datasetio:
-    - remote::huggingface
-    - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
-    telemetry:
-    - inline::meta-reference
-    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
-    - remote::wolfram-alpha
-image_type: conda
-additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@ -1,284 +0,0 @@
---
-orphan: true
---
-# Remote vLLM Distribution
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-You can use this distribution if you want to run an independent vLLM server for inference.
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-
-## Setting up vLLM server
-
-In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
-server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
-[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
-that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging.
-
-### Setting up vLLM server on AMD GPU
-
-AMD provides two main vLLM container options:
- rocm/vllm: Production-ready container
- rocm/vllm-dev: Development container with the latest vLLM features
-
-Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details.
-
-Here is a sample script to start a ROCm vLLM server locally via Docker:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export CUDA_VISIBLE_DEVICES=0
-export VLLM_DIMG="rocm/vllm-dev:main"
-
-docker run \
-    --pull always \
-    --ipc=host \
-    --privileged \
-    --shm-size 16g \
-    --device=/dev/kfd \
-    --device=/dev/dri \
-    --group-add video \
-    --cap-add=SYS_PTRACE \
-    --cap-add=CAP_SYS_ADMIN \
-    --security-opt seccomp=unconfined \
-    --security-opt apparmor=unconfined \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    $VLLM_DIMG \
-    python -m vllm.entrypoints.openai.api_server \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-export VLLM_DIMG="rocm/vllm-dev:main"
-
-docker run \
-    --pull always \
-    --ipc=host \
-    --privileged \
-    --shm-size 16g \
-    --device=/dev/kfd \
-    --device=/dev/dri \
-    --group-add video \
-    --cap-add=SYS_PTRACE \
-    --cap-add=CAP_SYS_ADMIN \
-    --security-opt seccomp=unconfined \
-    --security-opt apparmor=unconfined \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    $VLLM_DIMG \
-    python -m vllm.entrypoints.openai.api_server \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-### Setting up vLLM server on NVIDIA GPU
-
-Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export CUDA_VISIBLE_DEVICES=0
-
-docker run \
-    --pull always \
-    --runtime nvidia \
-    --gpus $CUDA_VISIBLE_DEVICES \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --gpu-memory-utilization 0.7 \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export CUDA_VISIBLE_DEVICES=1
-
-docker run \
-    --pull always \
-    --runtime nvidia \
-    --gpus $CUDA_VISIBLE_DEVICES \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --gpu-memory-utilization 0.7 \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-### Setting up vLLM server on Intel GPU
-
-Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend:
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
-
-Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
-export ZE_AFFINITY_MASK=0
-
-docker run \
-    --pull always \
-    --device /dev/dri \
-    -v /dev/dri/by-path:/dev/dri/by-path \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
-    -p $INFERENCE_PORT:$INFERENCE_PORT \
-    --ipc=host \
-    intel/vllm:xpu \
-    --gpu-memory-utilization 0.7 \
-    --model $INFERENCE_MODEL \
-    --port $INFERENCE_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-export ZE_AFFINITY_MASK=1
-
-docker run \
-    --pull always \
-    --device /dev/dri \
-    -v /dev/dri/by-path:/dev/dri/by-path \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
-    -p $SAFETY_PORT:$SAFETY_PORT \
-    --ipc=host \
-    intel/vllm:xpu \
-    --gpu-memory-utilization 0.7 \
-    --model $SAFETY_MODEL \
-    --port $SAFETY_PORT
-```
-
-## Running Llama Stack
-
-Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=8321
-
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-# You need a local checkout of llama-stack to run this, get it using
-# git clone https://github.com/meta-llama/llama-stack.git
-cd /path/to/llama-stack
-
-docker run \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
-  llamastack/distribution-{{ name }} \
-  --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env SAFETY_VLLM_URL=http://host.docker.internal:$SAFETY_PORT/v1
-```
-
-
-### Via Conda
-
-Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
-
-```bash
-export INFERENCE_PORT=8000
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=8321
-
-cd distributions/remote-vllm
-llama stack build --template remote-vllm --image-type conda
-
-llama stack run ./run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-export SAFETY_PORT=8081
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env SAFETY_VLLM_URL=http://localhost:$SAFETY_PORT/v1
-```
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@ -1,152 +0,0 @@
-version: 2
-image_name: remote-vllm
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
-    config:
-      url: ${env.SAFETY_VLLM_URL}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/responses_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/trace_store.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL}
-  provider_id: vllm-safety
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields:
- shield_id: ${env.SAFETY_MODEL}
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
-server:
-  port: 8321
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@ -1,140 +0,0 @@
-version: 2
-image_name: remote-vllm
-apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/faiss_store.db
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/responses_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/meta_reference_eval.db
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/huggingface_datasetio.db
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:+}
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/trace_store.db
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:+}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/inference_store.db
-models:
- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-  model_type: llm
- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
-server:
-  port: 8321
--- a/Show more
+++ b/Show more