diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c85436c4..3707d4671 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: 'build' +exclude: 'build/' default_language_version: python: python3 diff --git a/MANIFEST.in b/MANIFEST.in index 7426a3abd..0517b86a8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ include requirements.txt include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh -include distributions/*/build.yaml +include llama_stack/templates/*/build.yaml diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml deleted file mode 100644 index ae7b27d49..000000000 --- a/distributions/bedrock/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: bedrock -distribution_spec: - description: Use Amazon Bedrock APIs. - providers: - inference: remote::bedrock - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml new file mode 120000 index 000000000..72402ef8d --- /dev/null +++ b/distributions/bedrock/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/bedrock/build.yaml \ No newline at end of file diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml deleted file mode 100644 index 2188dd0a0..000000000 --- a/distributions/databricks/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: databricks -distribution_spec: - description: Use Databricks for running LLM inference - providers: - inference: remote::databricks - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml new file mode 120000 index 000000000..66342fe6f --- /dev/null +++ b/distributions/databricks/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/databricks/build.yaml \ No newline at end of file diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md index fcf74d809..e3987e1e2 100644 --- a/distributions/fireworks/README.md +++ b/distributions/fireworks/README.md @@ -49,7 +49,7 @@ inference: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template fireworks --image-type conda # -- modify run.yaml to a valid Fireworks server endpoint llama stack run ./run.yaml ``` diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml deleted file mode 100644 index 2e5cf0753..000000000 --- a/distributions/fireworks/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: fireworks -distribution_spec: - description: Use Fireworks.ai for running LLM inference - providers: - inference: remote::fireworks - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml new file mode 120000 index 000000000..32a5bd869 --- /dev/null +++ b/distributions/fireworks/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/fireworks/build.yaml \ No newline at end of file diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml deleted file mode 100644 index 750bebcb5..000000000 --- a/distributions/hf-endpoint/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: hf-endpoint -distribution_spec: - description: "Like local, but use Hugging Face Inference Endpoints for running LLM 
inference.\nSee https://hf.co/docs/api-endpoints." - providers: - inference: remote::hf::endpoint - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml new file mode 120000 index 000000000..a73c70c05 --- /dev/null +++ b/distributions/hf-endpoint/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/hf-endpoint/build.yaml \ No newline at end of file diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml deleted file mode 100644 index f6da3ad4d..000000000 --- a/distributions/hf-serverless/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: hf-serverless -distribution_spec: - description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference." - providers: - inference: remote::hf::serverless - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml new file mode 120000 index 000000000..f2db0fd55 --- /dev/null +++ b/distributions/hf-serverless/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/hf-serverless/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml deleted file mode 100644 index 5b1521a92..000000000 --- a/distributions/meta-reference-gpu/build.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: meta-reference-gpu -distribution_spec: - docker_image: pytorch/pytorch - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: meta-reference - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml new file mode 120000 index 000000000..4418195eb --- /dev/null +++ b/distributions/meta-reference-gpu/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/meta-reference-gpu/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml deleted file mode 100644 index e9ddb4aad..000000000 --- a/distributions/meta-reference-quantized-gpu/build.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: meta-reference-quantized-gpu -distribution_spec: - docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: meta-reference-quantized - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml new file mode 120000 index 000000000..f3dbe996f --- /dev/null +++ b/distributions/meta-reference-quantized-gpu/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml \ No newline at end of file diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md index d59c3f9e1..70bc27a85 100644 --- a/distributions/ollama/README.md +++ b/distributions/ollama/README.md @@ -86,6 +86,6 @@ inference: 
**Via Conda** ``` -llama stack build --config ./build.yaml +llama stack build --template ollama --image-type conda llama stack run ./gpu/run.yaml ``` diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml deleted file mode 100644 index c27f40929..000000000 --- a/distributions/ollama/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: ollama -distribution_spec: - description: Use ollama for running LLM inference - providers: - inference: remote::ollama - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml new file mode 120000 index 000000000..8772548e0 --- /dev/null +++ b/distributions/ollama/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md index 86d2636d7..886252ecd 100644 --- a/distributions/tgi/README.md +++ b/distributions/tgi/README.md @@ -88,7 +88,7 @@ inference: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template tgi --image-type conda # -- start a TGI server endpoint llama stack run ./gpu/run.yaml ``` diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml deleted file mode 100644 index 2c0ca1d33..000000000 --- a/distributions/tgi/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: tgi -distribution_spec: - description: Use TGI for running LLM inference - providers: - inference: remote::tgi - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml new file mode 120000 index 000000000..73e59ad84 --- /dev/null +++ b/distributions/tgi/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/tgi/build.yaml \ No newline at end of file diff --git a/distributions/together/README.md b/distributions/together/README.md index 227c7a450..b964673e0 100644 --- a/distributions/together/README.md +++ b/distributions/together/README.md @@ -62,7 +62,7 @@ memory: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template together --image-type conda # -- modify run.yaml to a valid Together server endpoint llama stack run ./run.yaml ``` diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml deleted file mode 100644 index 49eab859d..000000000 --- a/distributions/together/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: together -distribution_spec: - description: Use Together.ai for running LLM inference - providers: - inference: remote::together - memory: remote::weaviate - safety: remote::together - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml new file mode 120000 index 000000000..3877a9c96 --- /dev/null +++ b/distributions/together/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/together/build.yaml \ No newline at end of file diff --git a/distributions/vllm/build.yaml b/distributions/vllm/build.yaml deleted file mode 100644 index f41352eb1..000000000 --- a/distributions/vllm/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: vllm -distribution_spec: - description: Like local, but use vLLM for running LLM inference - providers: - inference: vllm - memory: meta-reference 
- safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda \ No newline at end of file diff --git a/distributions/vllm/build.yaml b/distributions/vllm/build.yaml new file mode 120000 index 000000000..dfc9401b6 --- /dev/null +++ b/distributions/vllm/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/vllm/build.yaml \ No newline at end of file diff --git a/docs/cli_reference.md b/docs/cli_reference.md index f0f67192f..ddc8e6b3e 100644 --- a/docs/cli_reference.md +++ b/docs/cli_reference.md @@ -279,11 +279,11 @@ llama stack build --list-templates You may then pick a template to build your distribution with providers fitted to your liking. ``` -llama stack build --template local-tgi --name my-tgi-stack +llama stack build --template local-tgi --name my-tgi-stack --image-type conda ``` ``` -$ llama stack build --template local-tgi --name my-tgi-stack +$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda ... ... Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml @@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~ #### Building from config file - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command. -- The config file will be of contents like the ones in `llama_stack/distributions/templates/`. +- The config file will be of contents like the ones in `llama_stack/templates/`. ``` -$ cat llama_stack/distribution/templates/local-ollama-build.yaml +$ cat build.yaml name: local-ollama distribution_spec: @@ -311,7 +311,7 @@ image_type: conda ``` ``` -llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml +llama stack build --config build.yaml ``` #### How to build distribution with Docker image diff --git a/docs/getting_started.md b/docs/getting_started.md index 4f06f5d47..2a90301d0 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -35,11 +35,7 @@ You have two ways to start up Llama stack server: 1. **Starting up server via docker**: - We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links. - - [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general) - - This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints. - - [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general) - - This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU. + We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder. > [!NOTE] > For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container. 
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index adac34d55..1fd523dcb 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -37,7 +37,7 @@ class ScoreResponse(BaseModel): class ScoringFunctionStore(Protocol): - def get_scoring_function(self, name: str) -> ScoringFunctionDefWithProvider: ... + def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ... @runtime_checkable diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index a242215c6..fc3584f90 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -29,7 +29,7 @@ class LLMAsJudgeContext(BaseModel): @json_schema_type -class ScoringFunctionDef(BaseModel): +class ScoringFnDef(BaseModel): identifier: str description: Optional[str] = None metadata: Dict[str, Any] = Field( @@ -48,7 +48,7 @@ class ScoringFunctionDef(BaseModel): @json_schema_type -class ScoringFunctionDefWithProvider(ScoringFunctionDef): +class ScoringFnDefWithProvider(ScoringFnDef): provider_id: str = Field( description="ID of the provider which serves this dataset", ) @@ -57,14 +57,14 @@ class ScoringFunctionDefWithProvider(ScoringFunctionDef): @runtime_checkable class ScoringFunctions(Protocol): @webmethod(route="/scoring_functions/list", method="GET") - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: ... + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: ... @webmethod(route="/scoring_functions/get", method="GET") async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: ... + ) -> Optional[ScoringFnDefWithProvider]: ... @webmethod(route="/scoring_functions/register", method="POST") async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: ... diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 26aa35e16..40fca4c6d 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -12,9 +12,7 @@ import os from functools import lru_cache from pathlib import Path -TEMPLATES_PATH = ( - Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions" -) +TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates" @lru_cache() @@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]: with open(p, "r") as f: build_config = BuildConfig(**yaml.safe_load(f)) template_specs.append(build_config) - return template_specs @@ -78,112 +75,17 @@ class StackBuild(Subcommand): choices=["conda", "docker"], ) - def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]: - if os.getenv("CONDA_PREFIX", ""): - conda_dir = ( - Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}" - ) - else: - cprint( - "Cannot find CONDA_PREFIX. 
Trying default conda path ~/.conda/envs...", - color="green", - ) - conda_dir = ( - Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}" - ) - build_config_file = Path(conda_dir) / f"{args.name}-build.yaml" - if build_config_file.exists(): - return build_config_file - - return None - - def _run_stack_build_command_from_build_config( - self, build_config: BuildConfig - ) -> None: - import json - import os - - import yaml - - from llama_stack.distribution.build import build_image, ImageType - from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR - from llama_stack.distribution.utils.serialize import EnumEncoder - from termcolor import cprint - - # save build.yaml spec for building same distribution again - if build_config.image_type == ImageType.docker.value: - # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image - llama_stack_path = Path( - os.path.abspath(__file__) - ).parent.parent.parent.parent - build_dir = llama_stack_path / "tmp/configs/" - else: - build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}" - - os.makedirs(build_dir, exist_ok=True) - build_file_path = build_dir / f"{build_config.name}-build.yaml" - - with open(build_file_path, "w") as f: - to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder)) - f.write(yaml.dump(to_write, sort_keys=False)) - - return_code = build_image(build_config, build_file_path) - if return_code != 0: - return - - configure_name = ( - build_config.name - if build_config.image_type == "conda" - else (f"llamastack-{build_config.name}") - ) - if build_config.image_type == "conda": - cprint( - f"You can now run `llama stack configure {configure_name}`", - color="green", - ) - else: - cprint( - f"You can now run `llama stack run {build_config.name}`", - color="green", - ) - - def _run_template_list_cmd(self, args: argparse.Namespace) -> None: - import json - - from llama_stack.cli.table import print_table - - # eventually, this should query a registry at llama.meta.com/llamastack/distributions - headers = [ - "Template Name", - "Providers", - "Description", - ] - - rows = [] - for spec in available_templates_specs(): - rows.append( - [ - spec.name, - json.dumps(spec.distribution_spec.providers, indent=2), - spec.distribution_spec.description, - ] - ) - print_table( - rows, - headers, - separate_rows=True, - ) - def _run_stack_build_command(self, args: argparse.Namespace) -> None: import textwrap import yaml - from llama_stack.distribution.distribution import get_provider_registry from prompt_toolkit import prompt from prompt_toolkit.completion import WordCompleter from prompt_toolkit.validation import Validator from termcolor import cprint + from llama_stack.distribution.distribution import get_provider_registry + if args.list_templates: self._run_template_list_cmd(args) return @@ -194,19 +96,22 @@ class StackBuild(Subcommand): "You must specify a name for the build using --name when using a template" ) return - build_path = TEMPLATES_PATH / f"{args.template}-build.yaml" - if not build_path.exists(): - self.parser.error( - f"Could not find template {args.template}. 
Please run `llama stack build --list-templates` to check out the available templates" - ) - return - with open(build_path, "r") as f: - build_config = BuildConfig(**yaml.safe_load(f)) - build_config.name = args.name - if args.image_type: - build_config.image_type = args.image_type - self._run_stack_build_command_from_build_config(build_config) + available_templates = available_templates_specs() + for build_config in available_templates: + if build_config.name == args.template: + build_config.name = args.name + if args.image_type: + build_config.image_type = args.image_type + else: + self.parser.error( + f"Please specify a image-type (docker | conda) for {args.template}" + ) + self._run_stack_build_command_from_build_config(build_config) + return + self.parser.error( + f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates" + ) return # try to see if we can find a pre-existing build config file through name @@ -297,3 +202,99 @@ class StackBuild(Subcommand): self.parser.error(f"Could not parse config file {args.config}: {e}") return self._run_stack_build_command_from_build_config(build_config) + + def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]: + if os.getenv("CONDA_PREFIX", ""): + conda_dir = ( + Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}" + ) + else: + cprint( + "Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...", + color="green", + ) + conda_dir = ( + Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}" + ) + build_config_file = Path(conda_dir) / f"{args.name}-build.yaml" + if build_config_file.exists(): + return build_config_file + + return None + + def _run_stack_build_command_from_build_config( + self, build_config: BuildConfig + ) -> None: + import json + import os + + import yaml + from termcolor import cprint + + from llama_stack.distribution.build import build_image, ImageType + from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR + from llama_stack.distribution.utils.serialize import EnumEncoder + + # save build.yaml spec for building same distribution again + if build_config.image_type == ImageType.docker.value: + # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image + llama_stack_path = Path( + os.path.abspath(__file__) + ).parent.parent.parent.parent + build_dir = llama_stack_path / "tmp/configs/" + else: + build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}" + + os.makedirs(build_dir, exist_ok=True) + build_file_path = build_dir / f"{build_config.name}-build.yaml" + + with open(build_file_path, "w") as f: + to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder)) + f.write(yaml.dump(to_write, sort_keys=False)) + + return_code = build_image(build_config, build_file_path) + if return_code != 0: + return + + configure_name = ( + build_config.name + if build_config.image_type == "conda" + else (f"llamastack-{build_config.name}") + ) + if build_config.image_type == "conda": + cprint( + f"You can now run `llama stack configure {configure_name}`", + color="green", + ) + else: + cprint( + f"You can now edit your run.yaml file and run `docker run -it -p 5000:5000 {build_config.name}`. 
See full command in llama-stack/distributions/", + color="green", + ) + + def _run_template_list_cmd(self, args: argparse.Namespace) -> None: + import json + + from llama_stack.cli.table import print_table + + # eventually, this should query a registry at llama.meta.com/llamastack/distributions + headers = [ + "Template Name", + "Providers", + "Description", + ] + + rows = [] + for spec in available_templates_specs(): + rows.append( + [ + spec.name, + json.dumps(spec.distribution_spec.providers, indent=2), + spec.distribution_spec.description, + ] + ) + print_table( + rows, + headers, + separate_rows=True, + ) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 13c545723..e3a9d9186 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -8,18 +8,19 @@ from enum import Enum from typing import List, Optional import pkg_resources - -from llama_stack.distribution.utils.exec import run_with_pty from pydantic import BaseModel from termcolor import cprint +from llama_stack.distribution.utils.exec import run_with_pty + from llama_stack.distribution.datatypes import * # noqa: F403 from pathlib import Path -from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR from llama_stack.distribution.distribution import get_provider_registry +from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR + # These are the dependencies needed by the distribution server. # `llama-stack` is automatically installed by the installation script. diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 3bf74edcf..8044dda28 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} @@ -34,9 +40,6 @@ REPO_CONFIGS_DIR="$REPO_DIR/tmp/configs" TEMP_DIR=$(mktemp -d) -llama stack configure $build_file_path -cp $host_build_dir/$build_name-run.yaml $REPO_CONFIGS_DIR - add_to_docker() { local input output_file="$TEMP_DIR/Dockerfile" @@ -113,7 +116,6 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"] EOF add_to_docker "ADD tmp/configs/$(basename "$build_file_path") ./llamastack-build.yaml" -add_to_docker "ADD tmp/configs/$build_name-run.yaml ./llamastack-run.yaml" printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile" cat $TEMP_DIR/Dockerfile diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 318809baf..9ad82cd79 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -34,7 +34,7 @@ RoutableObject = Union[ ShieldDef, MemoryBankDef, DatasetDef, - ScoringFunctionDef, + ScoringFnDef, ] RoutableObjectWithProvider = Union[ @@ -42,7 +42,7 @@ RoutableObjectWithProvider = Union[ ShieldDefWithProvider, MemoryBankDefWithProvider, DatasetDefWithProvider, - ScoringFunctionDefWithProvider, + ScoringFnDefWithProvider, ] RoutedProtocol = Union[ diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index dcd588a9e..3e07b9162 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -100,7 +100,7 @@ class CommonRoutingTableImpl(RoutingTable): scoring_functions = await p.list_scoring_functions() add_objects( [ - ScoringFunctionDefWithProvider(**s.dict(), provider_id=pid) + ScoringFnDefWithProvider(**s.dict(), provider_id=pid) for s in scoring_functions ] ) @@ -239,7 +239,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: objects = [] for objs in self.registry.values(): objects.extend(objs) @@ -247,10 +247,10 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: + ) -> Optional[ScoringFnDefWithProvider]: return self.get_object_by_identifier(name) async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: await self.register_object(function_def) diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 8533da7d1..fe1b5051f 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -29,7 +29,7 @@ if [ $# -lt 3 ]; then fi build_name="$1" -docker_image="llamastack-$build_name" +docker_image="distribution-$build_name" shift yaml_config="$1" diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 8d476a509..eace0ea1a 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, Field from llama_stack.apis.datasets import DatasetDef from llama_stack.apis.memory_banks import MemoryBankDef from llama_stack.apis.models import ModelDef -from llama_stack.apis.scoring_functions import 
ScoringFunctionDef +from llama_stack.apis.scoring_functions import ScoringFnDef from llama_stack.apis.shields import ShieldDef @@ -64,11 +64,9 @@ class DatasetsProtocolPrivate(Protocol): class ScoringFunctionsProtocolPrivate(Protocol): - async def list_scoring_functions(self) -> List[ScoringFunctionDef]: ... + async def list_scoring_functions(self) -> List[ScoringFnDef]: ... - async def register_scoring_function( - self, function_def: ScoringFunctionDef - ) -> None: ... + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: ... @json_schema_type diff --git a/llama_stack/providers/impls/meta_reference/agents/agents.py b/llama_stack/providers/impls/meta_reference/agents/agents.py index ca5a00359..13d9044fd 100644 --- a/llama_stack/providers/impls/meta_reference/agents/agents.py +++ b/llama_stack/providers/impls/meta_reference/agents/agents.py @@ -169,7 +169,7 @@ class MetaReferenceAgentsImpl(Agents): turn_ids: Optional[List[str]] = None, ) -> Session: session = await self.persistence_store.get(f"session:{agent_id}:{session_id}") - session = Session(**json.loads(session)) + session = Session(**json.loads(session), turns=[]) turns = [] if turn_ids: for turn_id in turn_ids: diff --git a/llama_stack/providers/impls/meta_reference/eval/eval.py b/llama_stack/providers/impls/meta_reference/eval/eval.py index daa17a89e..d675e40eb 100644 --- a/llama_stack/providers/impls/meta_reference/eval/eval.py +++ b/llama_stack/providers/impls/meta_reference/eval/eval.py @@ -3,6 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from enum import Enum from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.common.type_system import * # noqa: F403 @@ -16,6 +17,13 @@ from llama_stack.apis.scoring import Scoring from .config import MetaReferenceEvalConfig +class ColumnName(Enum): + expected_answer = "expected_answer" + chat_completion_input = "chat_completion_input" + completion_input = "completion_input" + generated_answer = "generated_answer" + + class MetaReferenceEvalImpl(Eval): def __init__( self, @@ -41,18 +49,16 @@ class MetaReferenceEvalImpl(Eval): async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id) if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError( - f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." 
- ) + raise ValueError(f"Dataset {dataset_id} does not have a schema defined.") expected_schemas = [ { - "expected_answer": StringType(), - "chat_completion_input": ChatCompletionInputType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.chat_completion_input.value: ChatCompletionInputType(), }, { - "expected_answer": StringType(), - "chat_completion_input": CompletionInputType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.completion_input.value: CompletionInputType(), }, ] @@ -94,27 +100,43 @@ class MetaReferenceEvalImpl(Eval): raise NotImplementedError( "Evaluation with generation has not been implemented for agents" ) + assert ( + candidate.sampling_params.max_tokens is not None + ), "SamplingParams.max_tokens must be provided" + generations = [] for x in input_rows: - if "completion_input" in x: - raise NotImplementedError( - "Evaluation with completion API has not been implemented" + if ColumnName.completion_input.value in x: + input_content = eval(str(x[ColumnName.completion_input.value])) + response = await self.inference_api.completion( + model=candidate.model, + content=input_content, + sampling_params=candidate.sampling_params, ) - - input_messages = eval(str(x["chat_completion_input"])) - input_messages = [UserMessage(**x) for x in input_messages] - messages = [] - if candidate.system_message: - messages.append(candidate.system_message) - messages += input_messages - response = await self.inference_api.chat_completion( - model=candidate.model, - messages=messages, - sampling_params=candidate.sampling_params, - ) - generations.append( - {"generated_answer": response.completion_message.content} - ) + generations.append( + { + ColumnName.generated_answer.value: response.completion_message.content + } + ) + elif ColumnName.chat_completion_input.value in x: + input_messages = eval(str(x[ColumnName.chat_completion_input.value])) + input_messages = [UserMessage(**x) for x in input_messages] + messages = [] + if candidate.system_message: + messages.append(candidate.system_message) + messages += input_messages + response = await self.inference_api.chat_completion( + model=candidate.model, + messages=messages, + sampling_params=candidate.sampling_params, + ) + generations.append( + { + ColumnName.generated_answer.value: response.completion_message.content + } + ) + else: + raise ValueError("Invalid input row") # scoring with generated_answer score_input_rows = [ @@ -132,6 +154,8 @@ class MetaReferenceEvalImpl(Eval): if job_id in self.jobs: return JobStatus.completed + return None + async def job_cancel(self, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") diff --git a/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh b/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh index d3028f8e8..ae0ed0bac 100644 --- a/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh +++ b/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ if [[ $# -ne 1 ]]; then echo "Error: Please provide the name of CONDA environment you wish to create" exit 1 diff --git a/llama_stack/providers/impls/meta_reference/scoring/scoring.py b/llama_stack/providers/impls/meta_reference/scoring/scoring.py index 05ace33b4..b1d561533 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scoring.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring.py @@ -13,22 +13,22 @@ from llama_stack.apis.datasetio import * # noqa: F403 from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate -from llama_stack.providers.impls.meta_reference.scoring.scorer.equality_scorer import ( - EqualityScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.equality_scoring_fn import ( + EqualityScoringFn, ) -from llama_stack.providers.impls.meta_reference.scoring.scorer.subset_of_scorer import ( - SubsetOfScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.subset_of_scoring_fn import ( + SubsetOfScoringFn, ) from .config import MetaReferenceScoringConfig -SUPPORTED_SCORERS = [ - EqualityScorer, - SubsetOfScorer, +SUPPORTED_SCORING_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, ] -SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORERS} +SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORING_FNS} class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): @@ -46,10 +46,10 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def shutdown(self) -> None: ... - async def list_scoring_functions(self) -> List[ScoringFunctionDef]: - return [x.scoring_function_def for x in SUPPORTED_SCORERS] + async def list_scoring_functions(self) -> List[ScoringFnDef]: + return [x.scoring_function_def for x in SUPPORTED_SCORING_FNS] - async def register_scoring_function(self, function_def: ScoringFunctionDef) -> None: + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: raise NotImplementedError( "Dynamically registering scoring functions is not supported" ) @@ -101,9 +101,9 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): for scoring_fn_id in scoring_functions: if scoring_fn_id not in SCORER_REGISTRY: raise ValueError(f"Scoring function {scoring_fn_id} is not supported.") - scorer = SCORER_REGISTRY[scoring_fn_id]() - score_results = scorer.score(input_rows) - agg_results = scorer.aggregate(score_results) + scoring_fn = SCORER_REGISTRY[scoring_fn_id]() + score_results = scoring_fn.score(input_rows) + agg_results = scoring_fn.aggregate(score_results) res[scoring_fn_id] = ScoringResult( score_rows=score_results, aggregated_results=agg_results, diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py similarity index 100% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py similarity index 81% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py index ea8a3f063..952d46bb2 100644 --- 
a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py @@ -9,15 +9,15 @@ from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 -class BaseScorer(ABC): +class BaseScoringFn(ABC): """ - Base interface class for all meta-reference scorers. - Each scorer needs to implement the following methods: + Base interface class for all meta-reference scoring_fns. + Each scoring_fn needs to implement the following methods: - score_row(self, row) - - aggregate(self, scorer_results) + - aggregate(self, scoring_fn_results) """ - scoring_function_def: ScoringFunctionDef + scoring_function_def: ScoringFnDef def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/common.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py similarity index 100% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/common.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py similarity index 76% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py index 0c7751f35..cce0f948a 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py @@ -4,23 +4,23 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import ( - BaseScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import ( + BaseScoringFn, ) from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 from llama_stack.apis.common.type_system import * # noqa: F403 -from llama_stack.providers.impls.meta_reference.scoring.scorer.common import ( +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import ( aggregate_accuracy, ) -class EqualityScorer(BaseScorer): +class EqualityScoringFn(BaseScoringFn): """ - A scorer that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise. + A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise. 
""" - scoring_function_def = ScoringFunctionDef( + scoring_function_def = ScoringFnDef( identifier="equality", description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.", parameters=[], diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py similarity index 76% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py index e72b5ed0f..c7ee68e26 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py @@ -4,23 +4,23 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import ( - BaseScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import ( + BaseScoringFn, ) from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 from llama_stack.apis.common.type_system import * # noqa: F403 -from llama_stack.providers.impls.meta_reference.scoring.scorer.common import ( +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import ( aggregate_accuracy, ) -class SubsetOfScorer(BaseScorer): +class SubsetOfScoringFn(BaseScoringFn): """ - A scorer that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. + A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. 
""" - scoring_function_def = ScoringFunctionDef( + scoring_function_def = ScoringFnDef( identifier="subset_of", description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.", parameters=[], diff --git a/llama_stack/providers/impls/vllm/config.py b/llama_stack/providers/impls/vllm/config.py index df2526f2e..a7469ebde 100644 --- a/llama_stack/providers/impls/vllm/config.py +++ b/llama_stack/providers/impls/vllm/config.py @@ -15,13 +15,24 @@ class VLLMConfig(BaseModel): """Configuration for the vLLM inference provider.""" model: str = Field( - default="Llama3.1-8B-Instruct", + default="Llama3.2-3B-Instruct", description="Model descriptor from `llama model list`", ) tensor_parallel_size: int = Field( default=1, description="Number of tensor parallel replicas (number of GPUs to use).", ) + max_tokens: int = Field( + default=4096, + description="Maximum number of tokens to generate.", + ) + enforce_eager: bool = Field( + default=False, + description="Whether to use eager mode for inference (otherwise cuda graphs are used).", + ) + gpu_memory_utilization: float = Field( + default=0.3, + ) @field_validator("model") @classmethod diff --git a/llama_stack/providers/impls/vllm/vllm.py b/llama_stack/providers/impls/vllm/vllm.py index ad3ad8fb7..cf5b0572b 100644 --- a/llama_stack/providers/impls/vllm/vllm.py +++ b/llama_stack/providers/impls/vllm/vllm.py @@ -7,11 +7,12 @@ import logging import os import uuid -from typing import Any, AsyncGenerator +from typing import AsyncGenerator, Optional from llama_models.llama3.api.chat_format import ChatFormat from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_models.llama3.api.tokenizer import Tokenizer +from llama_models.sku_list import resolve_model from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams from llama_stack.apis.inference import * # noqa: F403 -from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper +from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate from llama_stack.providers.utils.inference.openai_compat import ( OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, @@ -40,74 +41,15 @@ def _random_uuid() -> str: return str(uuid.uuid4().hex) -def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams: - """Convert sampling params to vLLM sampling params.""" - if sampling_params is None: - return VLLMSamplingParams() - - # TODO convert what I saw in my first test ... but surely there's more to do here - kwargs = { - "temperature": sampling_params.temperature, - } - if sampling_params.top_k >= 1: - kwargs["top_k"] = sampling_params.top_k - if sampling_params.top_p: - kwargs["top_p"] = sampling_params.top_p - if sampling_params.max_tokens >= 1: - kwargs["max_tokens"] = sampling_params.max_tokens - if sampling_params.repetition_penalty > 0: - kwargs["repetition_penalty"] = sampling_params.repetition_penalty - - return VLLMSamplingParams(**kwargs) - - -class VLLMInferenceImpl(ModelRegistryHelper, Inference): +class VLLMInferenceImpl(Inference, ModelsProtocolPrivate): """Inference implementation for vLLM.""" - HF_MODEL_MAPPINGS = { - # TODO: seems like we should be able to build this table dynamically ... 
- "Llama3.1-8B": "meta-llama/Llama-3.1-8B", - "Llama3.1-70B": "meta-llama/Llama-3.1-70B", - "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B", - "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8", - "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B", - "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct", - "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct", - "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct", - "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8", - "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct", - "Llama3.2-1B": "meta-llama/Llama-3.2-1B", - "Llama3.2-3B": "meta-llama/Llama-3.2-3B", - "Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision", - "Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision", - "Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct", - "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct", - "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct", - "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision", - "Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4", - "Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B", - "Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B", - "Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8", - "Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M", - "Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B", - } - def __init__(self, config: VLLMConfig): - Inference.__init__(self) - ModelRegistryHelper.__init__( - self, - stack_to_provider_models_map=self.HF_MODEL_MAPPINGS, - ) self.config = config self.engine = None - - tokenizer = Tokenizer.get_instance() - self.formatter = ChatFormat(tokenizer) + self.formatter = ChatFormat(Tokenizer.get_instance()) async def initialize(self): - """Initialize the vLLM inference adapter.""" - log.info("Initializing vLLM inference adapter") # Disable usage stats reporting. This would be a surprising thing for most @@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): if "VLLM_NO_USAGE_STATS" not in os.environ: os.environ["VLLM_NO_USAGE_STATS"] = "1" - hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model) + model = resolve_model(self.config.model) + if model is None: + raise ValueError(f"Unknown model {self.config.model}") + + if model.huggingface_repo is None: + raise ValueError(f"Model {self.config.model} needs a huggingface repo") # TODO -- there are a ton of options supported here ... 
- engine_args = AsyncEngineArgs() - engine_args.model = hf_model - # We will need a new config item for this in the future if model support is more broad - # than it is today (llama only) - engine_args.tokenizer = hf_model - engine_args.tensor_parallel_size = self.config.tensor_parallel_size + engine_args = AsyncEngineArgs( + model=model.huggingface_repo, + tokenizer=model.huggingface_repo, + tensor_parallel_size=self.config.tensor_parallel_size, + enforce_eager=self.config.enforce_eager, + gpu_memory_utilization=self.config.gpu_memory_utilization, + guided_decoding_backend="lm-format-enforcer", + ) self.engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): if self.engine: self.engine.shutdown_background_loop() + async def register_model(self, model: ModelDef) -> None: + raise ValueError( + "You cannot dynamically add a model to a running vllm instance" + ) + + async def list_models(self) -> List[ModelDef]: + return [ + ModelDef( + identifier=self.config.model, + llama_model=self.config.model, + ) + ] + + def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams: + if sampling_params is None: + return VLLMSamplingParams(max_tokens=self.config.max_tokens) + + # TODO convert what I saw in my first test ... but surely there's more to do here + kwargs = { + "temperature": sampling_params.temperature, + "max_tokens": self.config.max_tokens, + } + if sampling_params.top_k: + kwargs["top_k"] = sampling_params.top_k + if sampling_params.top_p: + kwargs["top_p"] = sampling_params.top_p + if sampling_params.max_tokens: + kwargs["max_tokens"] = sampling_params.max_tokens + if sampling_params.repetition_penalty > 0: + kwargs["repetition_penalty"] = sampling_params.repetition_penalty + + return VLLMSamplingParams(**kwargs) + async def completion( self, model: str, content: InterleavedTextMedia, - sampling_params: Any | None = ..., - stream: bool | None = False, - logprobs: LogProbConfig | None = None, + sampling_params: Optional[SamplingParams] = SamplingParams(), + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, ) -> CompletionResponse | CompletionResponseStreamChunk: log.info("vLLM completion") messages = [UserMessage(content=content)] @@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): async def chat_completion( self, model: str, - messages: list[Message], - sampling_params: Any | None = ..., - tools: list[ToolDefinition] | None = ..., - tool_choice: ToolChoice | None = ..., - tool_prompt_format: ToolPromptFormat | None = ..., - stream: bool | None = False, - logprobs: LogProbConfig | None = None, + messages: List[Message], + sampling_params: Optional[SamplingParams] = SamplingParams(), + tools: Optional[List[ToolDefinition]] = None, + tool_choice: Optional[ToolChoice] = ToolChoice.auto, + tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk: log.info("vLLM chat completion") @@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): request_id = _random_uuid() prompt = chat_completion_request_to_prompt(request, self.formatter) - vllm_sampling_params = _vllm_sampling_params(request.sampling_params) + vllm_sampling_params = self._sampling_params(request.sampling_params) 
results_generator = self.engine.generate( prompt, vllm_sampling_params, request_id ) @@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): self, request: ChatCompletionRequest, results_generator: AsyncGenerator ) -> AsyncGenerator: async def _generate_and_convert_to_openai_compat(): + cur = [] async for chunk in results_generator: if not chunk.outputs: log.warning("Empty chunk received") continue - text = "".join([output.text for output in chunk.outputs]) + output = chunk.outputs[-1] + + new_tokens = output.token_ids[len(cur) :] + text = self.formatter.tokenizer.decode(new_tokens) + cur.extend(new_tokens) choice = OpenAICompatCompletionChoice( - finish_reason=chunk.outputs[-1].stop_reason, + finish_reason=output.finish_reason, text=text, ) yield OpenAICompatCompletionResponse( diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 4632cdd96..6b0d99a22 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -62,7 +62,7 @@ async def test_eval(eval_settings): response = await eval_impl.evaluate_batch( dataset_id=response[0].identifier, candidate=ModelCandidate( - model="Llama3.1-8B-Instruct", + model="Llama3.2-1B-Instruct", sampling_params=SamplingParams(), ), scoring_functions=["subset_of"], diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml new file mode 100644 index 000000000..a3ff27949 --- /dev/null +++ b/llama_stack/templates/bedrock/build.yaml @@ -0,0 +1,9 @@ +name: bedrock +distribution_spec: + description: Use Amazon Bedrock APIs. + providers: + inference: remote::bedrock + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/databricks/build.yaml b/llama_stack/templates/databricks/build.yaml new file mode 100644 index 000000000..f6c8b50a1 --- /dev/null +++ b/llama_stack/templates/databricks/build.yaml @@ -0,0 +1,9 @@ +name: databricks +distribution_spec: + description: Use Databricks for running LLM inference + providers: + inference: remote::databricks + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml new file mode 100644 index 000000000..37129bef0 --- /dev/null +++ b/llama_stack/templates/fireworks/build.yaml @@ -0,0 +1,9 @@ +name: fireworks +distribution_spec: + description: Use Fireworks.ai for running LLM inference + providers: + inference: remote::fireworks + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml new file mode 100644 index 000000000..6c84e5ccf --- /dev/null +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -0,0 +1,9 @@ +name: hf-endpoint +distribution_spec: + description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints." 
+  providers:
+    inference: remote::hf::endpoint
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml
new file mode 100644
index 000000000..32561c1fa
--- /dev/null
+++ b/llama_stack/templates/hf-serverless/build.yaml
@@ -0,0 +1,9 @@
+name: hf-serverless
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  providers:
+    inference: remote::hf::serverless
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml
new file mode 100644
index 000000000..d0fe93aa3
--- /dev/null
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
new file mode 100644
index 000000000..20500ea5a
--- /dev/null
+++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-quantized-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference-quantized
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
new file mode 100644
index 000000000..06de2fc3c
--- /dev/null
+++ b/llama_stack/templates/ollama/build.yaml
@@ -0,0 +1,12 @@
+name: ollama
+distribution_spec:
+  description: Use ollama for running LLM inference
+  providers:
+    inference: remote::ollama
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml
new file mode 100644
index 000000000..c5e618bb6
--- /dev/null
+++ b/llama_stack/templates/tgi/build.yaml
@@ -0,0 +1,12 @@
+name: tgi
+distribution_spec:
+  description: Use TGI for running LLM inference
+  providers:
+    inference: remote::tgi
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml
new file mode 100644
index 000000000..5232aeb93
--- /dev/null
+++ b/llama_stack/templates/together/build.yaml
@@ -0,0 +1,9 @@
+name: together
+distribution_spec:
+  description: Use Together.ai for running LLM inference
+  providers:
+    inference: remote::together
+    memory: remote::weaviate
+    safety: remote::together
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/vllm/build.yaml b/llama_stack/templates/vllm/build.yaml
new file mode 100644
index 000000000..d842896db
--- /dev/null
+++ b/llama_stack/templates/vllm/build.yaml
@@ -0,0 +1,9 @@
+name: vllm
+distribution_spec:
+  description: Like local, but use vLLM for running LLM inference
+  providers:
+    inference: vllm
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference