Add eval/scoring/datasetio API providers to distribution templates & UI developer guide (#564)

# What does this PR do? - add /eval, /scoring, /datasetio API providers to distribution templates - regenerate build.yaml / run.yaml files - fix `template.py` to take in list of providers instead of only first one - override memory provider as faiss default for all distro (as only 1 memory provider is needed to start basic flow, chromadb/pgvector need additional setup step). ``` python llama_stack/scripts/distro_codegen.py ``` - updated README to start UI via conda builds. ## Test Plan ``` python llama_stack/scripts/distro_codegen.py ``` - Use newly generated `run.yaml` to start server ``` llama stack run ./llama_stack/templates/together/run.yaml ``` <img width="1191" alt="image" src="https://github.com/user-attachments/assets/62f7d179-0cd0-427c-b6e8-e087d4648f09"> #### Registration ``` ❯ llama-stack-client datasets register \ --dataset-id "mmlu" \ --provider-id "huggingface" \ --url "https://huggingface.co/datasets/llamastack/evals" \ --metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \ --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' ❯ llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩ │ mmlu │ huggingface │ {'path': 'llamastack/evals', 'name': │ dataset │ │ │ │ 'evals__mmlu__details', 'split': │ │ │ │ │ 'train'} │ │ └────────────┴─────────────┴─────────────────────────────────────────┴─────────┘ ``` ``` ❯ llama-stack-client datasets register \ --dataset-id "simpleqa" \ --provider-id "huggingface" \ --url "https://huggingface.co/datasets/llamastack/evals" \ --metadata '{"path": "llamastack/evals", "name": "evals__simpleqa", "split": "train"}' \ --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' ❯ llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩ │ mmlu │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__mmlu__details', │ dataset │ │ │ │ 'split': 'train'} │ │ │ simpleqa │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__simpleqa', │ dataset │ │ │ │ 'split': 'train'} │ │ └────────────┴─────────────┴───────────────────────────────────────────────────────────────┴─────────┘ ``` ``` ❯ llama-stack-client eval_tasks register \ > --eval-task-id meta-reference-mmlu \ > --provider-id meta-reference \ > --dataset-id mmlu \ > --scoring-functions basic::regex_parser_multiple_choice_answer ❯ llama-stack-client eval_tasks register \ --eval-task-id meta-reference-simpleqa \ --provider-id meta-reference \ --dataset-id simpleqa \ --scoring-functions llm-as-judge::405b-simpleqa ❯ llama-stack-client eval_tasks list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ dataset_id ┃ identifier ┃ metadata ┃ provider_id ┃ provider_resour… ┃ scoring_functio… ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ mmlu │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['basic::regex_… │ eval_task │ │ simpleqa │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['llm-as-judge:… │ eval_task │ └────────────┴──────────────────┴──────────┴────────────────┴──────────────────┴──────────────────┴───────────┘ ``` #### Test with UI ``` streamlit run app.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests.
2025-12-03 09:53:45 +00:00 · 2024-12-05 16:29:32 -08:00 · 2024-12-05 16:29:32 -08:00 · 7301403ce3
commit 7301403ce3
parent a4daf4d3ec
47 changed files with 841 additions and 195 deletions
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@ -1,10 +1,12 @@
 {
-  "tgi": [
+  "hf-serverless": [
    "aiohttp",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
@ -13,6 +15,7 @@
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -27,6 +30,66 @@
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "together": [
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "together",
    "tqdm",
    "transformers",
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "vllm-gpu": [
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "tqdm",
    "transformers",
    "uvicorn",
    "vllm",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "remote-vllm": [
    "aiosqlite",
    "blobfile",
@ -54,18 +117,22 @@
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
-  "vllm-gpu": [
+  "fireworks": [
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "fireworks-ai",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -77,82 +144,17 @@
    "tqdm",
    "transformers",
    "uvicorn",
    "vllm",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
-  "meta-reference-quantized-gpu": [
+  "tgi": [
    "accelerate",
    "aiosqlite",
    "blobfile",
    "chardet",
    "chromadb-client",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fbgemm-gpu",
    "fire",
    "httpx",
    "lm-format-enforcer",
    "matplotlib",
    "nltk",
    "numpy",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "torch",
    "torchao==0.5.0",
    "torchvision",
    "tqdm",
    "transformers",
    "uvicorn",
    "zmq",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "meta-reference-gpu": [
    "accelerate",
    "aiosqlite",
    "blobfile",
    "chardet",
    "chromadb-client",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "lm-format-enforcer",
    "matplotlib",
    "nltk",
    "numpy",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "torch",
    "torchvision",
    "tqdm",
    "transformers",
    "uvicorn",
    "zmq",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "hf-serverless": [
    "aiohttp",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
@ -161,61 +163,7 @@
    "matplotlib",
    "nltk",
    "numpy",
-    "pandas",
+    "openai",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "tqdm",
    "transformers",
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "together": [
    "aiosqlite",
    "blobfile",
    "chardet",
    "chromadb-client",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "together",
    "tqdm",
    "transformers",
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "ollama": [
    "aiohttp",
    "aiosqlite",
    "blobfile",
    "chardet",
    "chromadb-client",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "ollama",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -232,10 +180,12 @@
  ],
  "bedrock": [
    "aiosqlite",
    "autoevals",
    "blobfile",
    "boto3",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
@ -243,6 +193,7 @@
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -257,20 +208,24 @@
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
-  "hf-endpoint": [
+  "meta-reference-gpu": [
-    "aiohttp",
+    "accelerate",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
-    "huggingface_hub",
+    "lm-format-enforcer",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -279,25 +234,34 @@
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "torch",
    "torchvision",
    "tqdm",
    "transformers",
    "uvicorn",
    "zmq",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
-  "fireworks": [
+  "meta-reference-quantized-gpu": [
    "accelerate",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fbgemm-gpu",
    "fire",
    "fireworks-ai",
    "httpx",
    "lm-format-enforcer",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
@ -306,9 +270,13 @@
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "torch",
    "torchao==0.5.0",
    "torchvision",
    "tqdm",
    "transformers",
    "uvicorn",
    "zmq",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
@ -337,5 +305,67 @@
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "ollama": [
    "aiohttp",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "nltk",
    "numpy",
    "ollama",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "tqdm",
    "transformers",
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
  "hf-endpoint": [
    "aiohttp",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "huggingface_hub",
    "matplotlib",
    "nltk",
    "numpy",
    "openai",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pypdf",
    "redis",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "tqdm",
    "transformers",
    "uvicorn",
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ]
 }
--- a/docs/source/distributions/self_hosted_distro/bedrock.md
+++ b/docs/source/distributions/self_hosted_distro/bedrock.md
@ -1,6 +1,3 @@
 ---
 orphan: true
 ---
 # Bedrock Distribution
 ```{toctree}
@ -15,9 +12,12 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::bedrock` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `remote::bedrock` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@ -15,9 +15,12 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::fireworks` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `inline::meta-reference` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `inline::meta-reference-quantized` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@ -15,9 +15,12 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
@ -119,7 +122,7 @@ llama stack run ./run-with-safety.yaml \
 ### (Optional) Update Model Serving Configuration
 ```{note}
-Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) variable for supported Ollama models.
+Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
 ```
 To serve a new model with `ollama`
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@ -16,9 +16,12 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::tgi` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@ -15,9 +15,12 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::together` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@ -1,16 +1,41 @@
-# LLama Stack UI
+# (Experimental) LLama Stack UI
-[!NOTE] This is a work in progress.
+## Docker Setup
-## Prerequisite
+:warning: This is a work in progress.
 - Start up Llama Stack Server
 ```
 llama stack run
 ```
-## Running Streamlit App
+## Developer Setup
 1. Start up Llama Stack API server. More details [here](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
 ```
 llama stack build --template together --image-type conda
 llama stack run together
 ```
 2. (Optional) Register datasets and eval tasks as resources. If you want to run pre-configured evaluation flows (e.g. Evaluations (Generation + Scoring) Page).
 ```bash
 $ llama-stack-client datasets register \
 --dataset-id "mmlu" \
 --provider-id "huggingface" \
 --url "https://huggingface.co/datasets/llamastack/evals" \
 --metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
 --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'
 ```
 ```bash
 $ llama-stack-client eval_tasks register \
 --eval-task-id meta-reference-mmlu \
 --provider-id meta-reference \
 --dataset-id mmlu \
 --scoring-functions basic::regex_parser_multiple_choice_answer
 ```
 3. Start Streamlit UI
 ```bash
 cd llama_stack/distribution/ui
 pip install -r requirements.txt
 streamlit run app.py
--- a/llama_stack/templates/bedrock/bedrock.py
+++ b/llama_stack/templates/bedrock/bedrock.py
@ -6,6 +6,9 @@
 from pathlib import Path
 from llama_stack.distribution.datatypes import Provider
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -16,10 +19,19 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["remote::bedrock"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
    name = "bedrock"
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    return DistributionTemplate(
-        name="bedrock",
+        name=name,
        distro_type="self_hosted",
        description="Use AWS Bedrock for running LLM inference and safety",
        docker_image=None,
@ -27,7 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
        providers=providers,
        default_models=[],
        run_configs={
-            "run.yaml": RunConfigSettings(),
+            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "memory": [memory_provider],
                },
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
--- a/llama_stack/templates/bedrock/build.yaml
+++ b/llama_stack/templates/bedrock/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/bedrock/run.yaml
+++ b/llama_stack/templates/bedrock/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: bedrock
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -37,6 +40,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/fireworks/build.yaml
+++ b/llama_stack/templates/fireworks/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/fireworks/fireworks.py
+++ b/llama_stack/templates/fireworks/fireworks.py
@ -9,6 +9,7 @@ from pathlib import Path
 from llama_models.sku_list import all_registered_models
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
@ -22,13 +23,23 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
    name = "fireworks"
    inference_provider = Provider(
        provider_id="fireworks",
        provider_type="remote::fireworks",
        config=FireworksImplConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    core_model_to_hf_repo = {
        m.descriptor(): m.huggingface_repo for m in all_registered_models()
@ -42,7 +53,7 @@ def get_distribution_template() -> DistributionTemplate:
    ]
    return DistributionTemplate(
-        name="fireworks",
+        name=name,
        distro_type="self_hosted",
        description="Use Fireworks.AI for running LLM inference",
        docker_image=None,
@ -53,6 +64,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=default_models,
                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: fireworks
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -39,6 +42,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/hf-endpoint/build.yaml
+++ b/llama_stack/templates/hf-endpoint/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/hf-endpoint/hf_endpoint.py
+++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "hf-endpoint"
    inference_provider = Provider(
        provider_id="hf-endpoint",
        provider_type="remote::hf::endpoint",
        config=InferenceEndpointImplConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -34,7 +43,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="hf-endpoint",
+        name=name,
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
@ -45,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -59,7 +69,8 @@ def get_distribution_template() -> DistributionTemplate:
                                endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
                            ),
                        ),
-                    ]
+                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: hf-endpoint
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -44,6 +47,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: hf-endpoint
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -39,6 +42,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/hf-serverless/build.yaml
+++ b/llama_stack/templates/hf-serverless/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/hf-serverless/hf_serverless.py
+++ b/llama_stack/templates/hf-serverless/hf_serverless.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -16,13 +17,22 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
    name = "hf-serverless"
    inference_provider = Provider(
        provider_id="hf-serverless",
        provider_type="remote::hf::serverless",
        config=InferenceAPIImplConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -34,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="hf-serverless",
+        name=name,
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
@ -45,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -59,7 +70,8 @@ def get_distribution_template() -> DistributionTemplate:
                                repo="${env.SAFETY_MODEL}",
                            ),
                        ),
-                    ]
+                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: hf-serverless
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -44,6 +47,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: hf-serverless
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -39,6 +42,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/meta-reference-gpu/build.yaml
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/meta-reference-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py
@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceInferenceConfig,
 )
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "meta-reference-gpu"
    inference_provider = Provider(
        provider_id="meta-reference-inference",
        provider_type="inline::meta-reference",
@ -30,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate:
            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
        ),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -41,7 +50,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="meta-reference-gpu",
+        name=name,
        distro_type="self_hosted",
        description="Use Meta Reference for running LLM inference",
        template_path=Path(__file__).parent / "doc_template.md",
@ -51,6 +60,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -67,6 +77,7 @@ def get_distribution_template() -> DistributionTemplate:
                            ),
                        ),
                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: meta-reference-gpu
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -46,6 +49,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: meta-reference-gpu
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -40,6 +43,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
+++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py
@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceQuantizedInferenceConfig,
 )
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "meta-reference-quantized-gpu"
    inference_provider = Provider(
        provider_id="meta-reference-inference",
        provider_type="inline::meta-reference-quantized",
@ -30,13 +34,18 @@ def get_distribution_template() -> DistributionTemplate:
            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
        ),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="meta-reference-inference",
    )
    return DistributionTemplate(
-        name="meta-reference-quantized-gpu",
+        name=name,
        distro_type="self_hosted",
        description="Use Meta Reference with fp8, int4 quantization for running LLM inference",
        template_path=Path(__file__).parent / "doc_template.md",
@ -46,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
--- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: meta-reference-quantized-gpu
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -42,6 +45,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@ -114,9 +114,9 @@ llama stack run ./run-with-safety.yaml \
 ### (Optional) Update Model Serving Configuration
-> [!NOTE]
+```{note}
-> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
+Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
-
+```
 To serve a new model with `ollama`
 ```bash
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@ -7,6 +7,7 @@
 from pathlib import Path
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -18,13 +19,21 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "ollama"
    inference_provider = Provider(
        provider_id="ollama",
        provider_type="remote::ollama",
        config=OllamaImplConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -36,7 +45,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="ollama",
+        name=name,
        distro_type="self_hosted",
        description="Use (an external) Ollama server for running LLM inference",
        docker_image=None,
@ -47,6 +56,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -54,7 +64,8 @@ def get_distribution_template() -> DistributionTemplate:
                provider_overrides={
                    "inference": [
                        inference_provider,
-                    ]
+                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: ollama
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -38,6 +41,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: ollama
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -38,6 +41,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@ -7,6 +7,7 @@
 from pathlib import Path
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -19,7 +20,7 @@ def get_distribution_template() -> DistributionTemplate:
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }
-
+    name = "remote-vllm"
    inference_provider = Provider(
        provider_id="vllm-inference",
        provider_type="remote::vllm",
@ -27,6 +28,11 @@ def get_distribution_template() -> DistributionTemplate:
            url="${env.VLLM_URL}",
        ),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -38,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="remote-vllm",
+        name=name,
        distro_type="self_hosted",
        description="Use (an external) vLLM server for running LLM inference",
        template_path=Path(__file__).parent / "doc_template.md",
@ -48,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -63,6 +70,7 @@ def get_distribution_template() -> DistributionTemplate:
                            ),
                        ),
                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@ -44,36 +44,37 @@ class RunConfigSettings(BaseModel):
                provider_configs[api_str] = api_providers
                continue
-            provider_type = provider_types[0]
+            provider_configs[api_str] = []
-            provider_id = provider_type.split("::")[-1]
+            for provider_type in provider_types:
                provider_id = provider_type.split("::")[-1]
-            api = Api(api_str)
+                api = Api(api_str)
-            if provider_type not in provider_registry[api]:
+                if provider_type not in provider_registry[api]:
-                raise ValueError(
+                    raise ValueError(
-                    f"Unknown provider type: {provider_type} for API: {api_str}"
+                        f"Unknown provider type: {provider_type} for API: {api_str}"
                    )
                config_class = provider_registry[api][provider_type].config_class
                assert (
                    config_class is not None
                ), f"No config class for provider type: {provider_type} for API: {api_str}"
                config_class = instantiate_class_type(config_class)
                if hasattr(config_class, "sample_run_config"):
                    config = config_class.sample_run_config(
                        __distro_dir__=f"distributions/{name}"
                    )
                else:
                    config = {}
                provider_configs[api_str].append(
                    Provider(
                        provider_id=provider_id,
                        provider_type=provider_type,
                        config=config,
                    )
                )
            config_class = provider_registry[api][provider_type].config_class
            assert (
                config_class is not None
            ), f"No config class for provider type: {provider_type} for API: {api_str}"
            config_class = instantiate_class_type(config_class)
            if hasattr(config_class, "sample_run_config"):
                config = config_class.sample_run_config(
                    __distro_dir__=f"distributions/{name}"
                )
            else:
                config = {}
            provider_configs[api_str] = [
                Provider(
                    provider_id=provider_id,
                    provider_type=provider_type,
                    config=config,
                )
            ]
        # Get unique set of APIs from providers
        apis = list(sorted(providers.keys()))
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: tgi
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -42,6 +45,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: tgi
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -38,6 +41,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@ -7,6 +7,7 @@
 from pathlib import Path
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import TGIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -18,8 +19,11 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "tgi"
    inference_provider = Provider(
        provider_id="tgi-inference",
        provider_type="remote::tgi",
@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate:
            url="${env.TGI_URL}",
        ),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -38,7 +47,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="tgi",
+        name=name,
        distro_type="self_hosted",
        description="Use (an external) TGI server for running LLM inference",
        docker_image=None,
@ -49,6 +58,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),
@ -64,6 +74,7 @@ def get_distribution_template() -> DistributionTemplate:
                            ),
                        ),
                    ],
                    "memory": [memory_provider],
                },
                default_models=[
                    inference_model,
--- a/llama_stack/templates/together/build.yaml
+++ b/llama_stack/templates/together/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: together
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -39,6 +42,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@ -9,6 +9,7 @@ from pathlib import Path
 from llama_models.sku_list import all_registered_models
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
@ -22,13 +23,21 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "together"
    inference_provider = Provider(
        provider_id="together",
        provider_type="remote::together",
        config=TogetherImplConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    core_model_to_hf_repo = {
        m.descriptor(): m.huggingface_repo for m in all_registered_models()
@ -42,7 +51,7 @@ def get_distribution_template() -> DistributionTemplate:
    ]
    return DistributionTemplate(
-        name="together",
+        name=name,
        distro_type="self_hosted",
        description="Use Together.AI for running LLM inference",
        docker_image=None,
@ -53,6 +62,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=default_models,
                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
--- a/llama_stack/templates/vllm-gpu/build.yaml
+++ b/llama_stack/templates/vllm-gpu/build.yaml
@ -16,4 +16,13 @@ distribution_spec:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
 image_type: conda
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@ -4,9 +4,12 @@ docker_image: null
 conda_env: vllm-gpu
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 providers:
  inference:
@ -42,6 +45,27 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@ -6,6 +6,7 @@
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
    }
-
+    name = "vllm-gpu"
    inference_provider = Provider(
        provider_id="vllm",
        provider_type="inline::vllm",
        config=VLLMConfig.sample_run_config(),
    )
    memory_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
@ -30,7 +39,7 @@ def get_distribution_template() -> DistributionTemplate:
    )
    return DistributionTemplate(
-        name="vllm-gpu",
+        name=name,
        distro_type="self_hosted",
        description="Use a built-in vLLM engine for running LLM inference",
        docker_image=None,
@ -41,6 +50,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "memory": [memory_provider],
                },
                default_models=[inference_model],
            ),