Merge branch 'main' into fix/issue-2584-llama4-tool-calling-v2

2025-12-23 00:52:26 +00:00 · 2025-07-28 09:06:49 -04:00 · 2025-07-28 09:06:49 -04:00 · 09f42d9d91
commit 09f42d9d91
parent 561912064c 09abdb0a37
183 changed files with 6226 additions and 5684 deletions
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -4,15 +4,83 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from enum import Enum
+from enum import Enum, EnumMeta

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from llama_stack.schema_utils import json_schema_type


+class DynamicApiMeta(EnumMeta):
+    def __new__(cls, name, bases, namespace):
+        # Store the original enum values
+        original_values = {k: v for k, v in namespace.items() if not k.startswith("_")}
+
+        # Create the enum class
+        cls = super().__new__(cls, name, bases, namespace)
+
+        # Store the original values for reference
+        cls._original_values = original_values
+        # Initialize _dynamic_values
+        cls._dynamic_values = {}
+
+        return cls
+
+    def __call__(cls, value):
+        try:
+            return super().__call__(value)
+        except ValueError as e:
+            # If this value was already dynamically added, return it
+            if value in cls._dynamic_values:
+                return cls._dynamic_values[value]
+
+            # If the value doesn't exist, create a new enum member
+            # Create a new member name from the value
+            member_name = value.lower().replace("-", "_")
+
+            # If this member name already exists in the enum, return the existing member
+            if member_name in cls._member_map_:
+                return cls._member_map_[member_name]
+
+            # Instead of creating a new member, raise ValueError to force users to use Api.add() to
+            # register new APIs explicitly
+            raise ValueError(f"API '{value}' does not exist. Use Api.add() to register new APIs.") from e
+
+    def __iter__(cls):
+        # Allow iteration over both static and dynamic members
+        yield from super().__iter__()
+        if hasattr(cls, "_dynamic_values"):
+            yield from cls._dynamic_values.values()
+
+    def add(cls, value):
+        """
+        Add a new API to the enum.
+        Used to register external APIs.
+        """
+        member_name = value.lower().replace("-", "_")
+
+        # If this member name already exists in the enum, return it
+        if member_name in cls._member_map_:
+            return cls._member_map_[member_name]
+
+        # Create a new enum member
+        member = object.__new__(cls)
+        member._name_ = member_name
+        member._value_ = value
+
+        # Add it to the enum class
+        cls._member_map_[member_name] = member
+        cls._member_names_.append(member_name)
+        cls._member_type_ = str
+
+        # Store it in our dynamic values
+        cls._dynamic_values[value] = member
+
+        return member
+
+
@json_schema_type
-class Api(Enum):
+class Api(Enum, metaclass=DynamicApiMeta):
    providers = "providers"
    inference = "inference"
    safety = "safety"
@ -54,3 +122,12 @@ class Error(BaseModel):
    title: str
    detail: str
    instance: str | None = None
+
+
+class ExternalApiSpec(BaseModel):
+    """Specification for an external API implementation."""
+
+    module: str = Field(..., description="Python module containing the API implementation")
+    name: str = Field(..., description="Name of the API")
+    pip_packages: list[str] = Field(default=[], description="List of pip packages to install the API")
+    protocol: str = Field(..., description="Name of the protocol class for the API")
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -819,12 +819,6 @@ class OpenAIEmbeddingsResponse(BaseModel):
 class ModelStore(Protocol):
    async def get_model(self, identifier: str) -> Model: ...

-    async def update_registered_llm_models(
-        self,
-        provider_id: str,
-        models: list[Model],
-    ) -> None: ...
-

 class TextTruncation(Enum):
    """Config for how to truncate text for embedding when text is longer than the model's max sequence length. Start and End semantics depend on whether the language is left-to-right or right-to-left.
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@ -22,6 +22,8 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
 # Add this constant near the top of the file, after the imports
 DEFAULT_TTL_DAYS = 7

+REQUIRED_SCOPE = "telemetry.read"
+

@json_schema_type
 class SpanStatus(Enum):
@ -259,7 +261,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces", method="POST")
+    @webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_traces(
        self,
        attribute_filters: list[QueryCondition] | None = None,
@ -277,7 +279,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
+    @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET", required_scope=REQUIRED_SCOPE)
    async def get_trace(self, trace_id: str) -> Trace:
        """Get a trace by its ID.

@ -286,7 +288,9 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
+    @webmethod(
+        route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET", required_scope=REQUIRED_SCOPE
+    )
    async def get_span(self, trace_id: str, span_id: str) -> Span:
        """Get a span by its ID.

@ -296,7 +300,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST")
+    @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST", required_scope=REQUIRED_SCOPE)
    async def get_span_tree(
        self,
        span_id: str,
@ -312,7 +316,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/spans", method="POST")
+    @webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_spans(
        self,
        attribute_filters: list[QueryCondition],
@ -345,7 +349,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST")
+    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_metrics(
        self,
        metric_name: str,
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@ -31,11 +31,13 @@ from llama_stack.distribution.build import (
 from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
 from llama_stack.distribution.datatypes import (
    BuildConfig,
+    BuildProvider,
    DistributionSpec,
    Provider,
    StackRunConfig,
 )
 from llama_stack.distribution.distribution import get_provider_registry
+from llama_stack.distribution.external import load_external_apis
 from llama_stack.distribution.resolver import InvalidProviderError
 from llama_stack.distribution.stack import replace_env_vars
 from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
@ -93,7 +95,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
            )
            sys.exit(1)
    elif args.providers:
-        providers_list: dict[str, str | list[str]] = dict()
+        provider_list: dict[str, list[BuildProvider]] = dict()
        for api_provider in args.providers.split(","):
            if "=" not in api_provider:
                cprint(
@ -102,7 +104,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                    file=sys.stderr,
                )
                sys.exit(1)
-            api, provider = api_provider.split("=")
+            api, provider_type = api_provider.split("=")
            providers_for_api = get_provider_registry().get(Api(api), None)
            if providers_for_api is None:
                cprint(
@ -111,16 +113,12 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                    file=sys.stderr,
                )
                sys.exit(1)
-            if provider in providers_for_api:
-                if api not in providers_list:
-                    providers_list[api] = []
-                # Use type guarding to ensure we have a list
-                provider_value = providers_list[api]
-                if isinstance(provider_value, list):
-                    provider_value.append(provider)
-                else:
-                    # Convert string to list and append
-                    providers_list[api] = [provider_value, provider]
+            if provider_type in providers_for_api:
+                provider = BuildProvider(
+                    provider_type=provider_type,
+                    module=None,
+                )
+                provider_list.setdefault(api, []).append(provider)
            else:
                cprint(
                    f"{provider} is not a valid provider for the {api} API.",
@ -129,7 +127,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                )
                sys.exit(1)
        distribution_spec = DistributionSpec(
-            providers=providers_list,
+            providers=provider_list,
            description=",".join(args.providers),
        )
        if not args.image_type:
@ -190,7 +188,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:

        cprint("Tip: use <TAB> to see options for the providers.\n", color="green", file=sys.stderr)

-        providers: dict[str, str | list[str]] = dict()
+        providers: dict[str, list[BuildProvider]] = dict()
        for api, providers_for_api in get_provider_registry().items():
            available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
            if not available_providers:
@ -205,7 +203,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                ),
            )

-            providers[api.value] = api_provider
+            string_providers = api_provider.split(" ")
+
+            for provider in string_providers:
+                providers.setdefault(api.value, []).append(BuildProvider(provider_type=provider))

        description = prompt(
            "\n > (Optional) Enter a short description for your Llama Stack: ",
@ -236,11 +237,13 @@ def run_stack_build_command(args: argparse.Namespace) -> None:

    if args.print_deps_only:
        print(f"# Dependencies for {args.template or args.config or image_name}")
-        normal_deps, special_deps = get_provider_dependencies(build_config)
+        normal_deps, special_deps, external_provider_dependencies = get_provider_dependencies(build_config)
        normal_deps += SERVER_DEPENDENCIES
        print(f"uv pip install {' '.join(normal_deps)}")
        for special_dep in special_deps:
            print(f"uv pip install {special_dep}")
+        for external_dep in external_provider_dependencies:
+            print(f"uv pip install {external_dep}")
        return

    try:
@ -303,27 +306,25 @@ def _generate_run_config(
    provider_registry = get_provider_registry(build_config)
    for api in apis:
        run_config.providers[api] = []
-        provider_types = build_config.distribution_spec.providers[api]
-        if isinstance(provider_types, str):
-            provider_types = [provider_types]
+        providers = build_config.distribution_spec.providers[api]

-        for i, provider_type in enumerate(provider_types):
-            pid = provider_type.split("::")[-1]
+        for provider in providers:
+            pid = provider.provider_type.split("::")[-1]

-            p = provider_registry[Api(api)][provider_type]
+            p = provider_registry[Api(api)][provider.provider_type]
            if p.deprecation_error:
                raise InvalidProviderError(p.deprecation_error)

            try:
-                config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
-            except ModuleNotFoundError:
+                config_type = instantiate_class_type(provider_registry[Api(api)][provider.provider_type].config_class)
+            except (ModuleNotFoundError, ValueError) as exc:
                # HACK ALERT:
                # This code executes after building is done, the import cannot work since the
                # package is either available in the venv or container - not available on the host.
                # TODO: use a "is_external" flag in ProviderSpec to check if the provider is
                # external
                cprint(
-                    f"Failed to import provider {provider_type} for API {api} - assuming it's external, skipping",
+                    f"Failed to import provider {provider.provider_type} for API {api} - assuming it's external, skipping: {exc}",
                    color="yellow",
                    file=sys.stderr,
                )
@ -336,9 +337,10 @@ def _generate_run_config(
                config = {}

            p_spec = Provider(
-                provider_id=f"{pid}-{i}" if len(provider_types) > 1 else pid,
-                provider_type=provider_type,
+                provider_id=pid,
+                provider_type=provider.provider_type,
                config=config,
+                module=provider.module,
            )
            run_config.providers[api].append(p_spec)

@ -401,9 +403,32 @@ def _run_stack_build_command_from_build_config(
        run_config_file = _generate_run_config(build_config, build_dir, image_name)

    with open(build_file_path, "w") as f:
-        to_write = json.loads(build_config.model_dump_json())
+        to_write = json.loads(build_config.model_dump_json(exclude_none=True))
        f.write(yaml.dump(to_write, sort_keys=False))

+    # We first install the external APIs so that the build process can use them and discover the
+    # providers dependencies
+    if build_config.external_apis_dir:
+        cprint("Installing external APIs", color="yellow", file=sys.stderr)
+        external_apis = load_external_apis(build_config)
+        if external_apis:
+            # install the external APIs
+            packages = []
+            for _, api_spec in external_apis.items():
+                if api_spec.pip_packages:
+                    packages.extend(api_spec.pip_packages)
+                    cprint(
+                        f"Installing {api_spec.name} with pip packages {api_spec.pip_packages}",
+                        color="yellow",
+                        file=sys.stderr,
+                    )
+            return_code = run_command(["uv", "pip", "install", *packages])
+            if return_code != 0:
+                packages_str = ", ".join(packages)
+                raise RuntimeError(
+                    f"Failed to install external APIs packages: {packages_str} (return code: {return_code})"
+                )
+
    return_code = build_image(
        build_config,
        build_file_path,
--- a/llama_stack/distribution/build.py
+++ b/llama_stack/distribution/build.py
@ -14,6 +14,7 @@ from termcolor import cprint

 from llama_stack.distribution.datatypes import BuildConfig
 from llama_stack.distribution.distribution import get_provider_registry
+from llama_stack.distribution.external import load_external_apis
 from llama_stack.distribution.utils.exec import run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api
@ -41,7 +42,7 @@ class ApiInput(BaseModel):

 def get_provider_dependencies(
    config: BuildConfig | DistributionTemplate,
-) -> tuple[list[str], list[str]]:
+) -> tuple[list[str], list[str], list[str]]:
    """Get normal and special dependencies from provider configuration."""
    if isinstance(config, DistributionTemplate):
        config = config.build_config()
@ -50,6 +51,7 @@ def get_provider_dependencies(
    additional_pip_packages = config.additional_pip_packages

    deps = []
+    external_provider_deps = []
    registry = get_provider_registry(config)
    for api_str, provider_or_providers in providers.items():
        providers_for_api = registry[Api(api_str)]
@ -64,8 +66,16 @@ def get_provider_dependencies(
                raise ValueError(f"Provider `{provider}` is not available for API `{api_str}`")

            provider_spec = providers_for_api[provider_type]
-            deps.extend(provider_spec.pip_packages)
-            if provider_spec.container_image:
+            if hasattr(provider_spec, "is_external") and provider_spec.is_external:
+                # this ensures we install the top level module for our external providers
+                if provider_spec.module:
+                    if isinstance(provider_spec.module, str):
+                        external_provider_deps.append(provider_spec.module)
+                    else:
+                        external_provider_deps.extend(provider_spec.module)
+            if hasattr(provider_spec, "pip_packages"):
+                deps.extend(provider_spec.pip_packages)
+            if hasattr(provider_spec, "container_image") and provider_spec.container_image:
                raise ValueError("A stack's dependencies cannot have a container image")

    normal_deps = []
@ -78,7 +88,7 @@ def get_provider_dependencies(

    normal_deps.extend(additional_pip_packages or [])

-    return list(set(normal_deps)), list(set(special_deps))
+    return list(set(normal_deps)), list(set(special_deps)), list(set(external_provider_deps))


 def print_pip_install_help(config: BuildConfig):
@ -103,41 +113,59 @@ def build_image(
 ):
    container_base = build_config.distribution_spec.container_image or "python:3.12-slim"

-    normal_deps, special_deps = get_provider_dependencies(build_config)
+    normal_deps, special_deps, external_provider_deps = get_provider_dependencies(build_config)
    normal_deps += SERVER_DEPENDENCIES
+    if build_config.external_apis_dir:
+        external_apis = load_external_apis(build_config)
+        if external_apis:
+            for _, api_spec in external_apis.items():
+                normal_deps.extend(api_spec.pip_packages)

    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
        script = str(importlib.resources.files("llama_stack") / "distribution/build_container.sh")
        args = [
            script,
+            "--template-or-config",
            template_or_config,
+            "--image-name",
            image_name,
+            "--container-base",
            container_base,
+            "--normal-deps",
            " ".join(normal_deps),
        ]
-
        # When building from a config file (not a template), include the run config path in the
        # build arguments
        if run_config is not None:
-            args.append(run_config)
+            args.extend(["--run-config", run_config])
    elif build_config.image_type == LlamaStackImageType.CONDA.value:
        script = str(importlib.resources.files("llama_stack") / "distribution/build_conda_env.sh")
        args = [
            script,
+            "--env-name",
            str(image_name),
+            "--build-file-path",
            str(build_file_path),
+            "--normal-deps",
            " ".join(normal_deps),
        ]
    elif build_config.image_type == LlamaStackImageType.VENV.value:
        script = str(importlib.resources.files("llama_stack") / "distribution/build_venv.sh")
        args = [
            script,
+            "--env-name",
            str(image_name),
+            "--normal-deps",
            " ".join(normal_deps),
        ]

+    # Always pass both arguments, even if empty, to maintain consistent positional arguments
    if special_deps:
-        args.append("#".join(special_deps))
+        args.extend(["--optional-deps", "#".join(special_deps)])
+    if external_provider_deps:
+        args.extend(
+            ["--external-provider-deps", "#".join(external_provider_deps)]
+        )  # the script will install external provider module, get its deps, and install those too.

    return_code = run_command(args)

--- a/llama_stack/distribution/build_conda_env.sh
+++ b/llama_stack/distribution/build_conda_env.sh
@ -9,10 +9,91 @@
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+PYPI_VERSION=${PYPI_VERSION:-}
 # This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}

+set -euo pipefail
+
+# Define color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m' # No Color
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+source "$SCRIPT_DIR/common.sh"
+
+# Usage function
+usage() {
+  echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+env_name=""
+build_file_path=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --env-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --env-name requires a string value" >&2
+        usage
+      fi
+      env_name="$2"
+      shift 2
+      ;;
+    --build-file-path)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --build-file-path requires a string value" >&2
+        usage
+      fi
+      build_file_path="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
+  echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
+  usage
+fi
+
 if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
 fi
@ -20,50 +101,18 @@ if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
 fi

-if [ "$#" -lt 3 ]; then
-  echo "Usage: $0 <distribution_type> <conda_env_name> <build_file_path> <pip_dependencies> [<special_pip_deps>]" >&2
-  echo "Example: $0 <distribution_type> my-conda-env ./my-stack-build.yaml 'numpy pandas scipy'" >&2
-  exit 1
-fi
-
-special_pip_deps="$4"
-
-set -euo pipefail
-
-env_name="$1"
-build_file_path="$2"
-pip_dependencies="$3"
-
-# Define color codes
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-NC='\033[0m' # No Color
-
-# this is set if we actually create a new conda in which case we need to clean up
-ENVNAME=""
-
-SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-source "$SCRIPT_DIR/common.sh"
-
 ensure_conda_env_python310() {
-  local env_name="$1"
-  local pip_dependencies="$2"
-  local special_pip_deps="$3"
+  # Use only global variables set by flag parser
  local python_version="3.12"

-  # Check if conda command is available
  if ! is_command_available conda; then
    printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
    exit 1
  fi

-  # Check if the environment exists
  if conda env list | grep -q "^${env_name} "; then
    printf "Conda environment '${env_name}' exists. Checking Python version...\n"
-
-    # Check Python version in the environment
    current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
-
    if [ "$current_version" = "$python_version" ]; then
      printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
    else
@ -73,37 +122,37 @@ ensure_conda_env_python310() {
  else
    printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
    conda create -n "${env_name}" python="${python_version}" -y
-
-    ENVNAME="${env_name}"
-    # setup_cleanup_handlers
  fi

  eval "$(conda shell.bash hook)"
  conda deactivate && conda activate "${env_name}"
-
  "$CONDA_PREFIX"/bin/pip install uv

  if [ -n "$TEST_PYPI_VERSION" ]; then
-    # these packages are damaged in test-pypi, so install them first
    uv pip install fastapi libcst
    uv pip install --extra-index-url https://test.pypi.org/simple/ \
      llama-stack=="$TEST_PYPI_VERSION" \
-      "$pip_dependencies"
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+      "$normal_deps"
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install $part
+      done
+    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install "$part"
      done
    fi
  else
-    # Re-installing llama-stack in the new conda environment
    if [ -n "$LLAMA_STACK_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
@ -115,31 +164,44 @@ ensure_conda_env_python310() {
      fi
      uv pip install --no-cache-dir "$SPEC_VERSION"
    fi
-
    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi
-
-    # Install pip dependencies
    printf "Installing pip dependencies\n"
-    uv pip install $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+    uv pip install $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "Getting provider spec for module: $part and installing dependencies"
+        package_name=$(echo "$part" | sed 's/[<>=!].*//')
+        python3 -c "
+import importlib
+import sys
+try:
+    module = importlib.import_module(f'$package_name.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        print('\\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
+" | uv pip install -r -
+      done
+    fi
  fi
-
  mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
  echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
 }

-ensure_conda_env_python310 "$env_name" "$pip_dependencies" "$special_pip_deps"
+ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -27,52 +27,103 @@ RUN_CONFIG_PATH=/app/run.yaml

 BUILD_CONTEXT_DIR=$(pwd)

-if [ "$#" -lt 4 ]; then
-  # This only works for templates
-  echo "Usage: $0 <template_or_config> <image_name> <container_base> <pip_dependencies> [<run_config>] [<special_pip_deps>]" >&2
-  exit 1
-fi
 set -euo pipefail

-template_or_config="$1"
-shift
-image_name="$1"
-shift
-container_base="$1"
-shift
-pip_dependencies="$1"
-shift
-
-# Handle optional arguments
-run_config=""
-special_pip_deps=""
-
-# Check if there are more arguments
-# The logics is becoming cumbersom, we should refactor it if we can do better
-if [ $# -gt 0 ]; then
-  # Check if the argument ends with .yaml
-  if [[ "$1" == *.yaml ]]; then
-    run_config="$1"
-    shift
-    # If there's another argument after .yaml, it must be special_pip_deps
-    if [ $# -gt 0 ]; then
-      special_pip_deps="$1"
-    fi
-  else
-    # If it's not .yaml, it must be special_pip_deps
-    special_pip_deps="$1"
-  fi
-fi
-
 # Define color codes
 RED='\033[0;31m'
 NC='\033[0m' # No Color

+# Usage function
+usage() {
+  echo "Usage: $0 --image-name <image_name> --container-base <container_base> --normal-deps <pip_dependencies> [--run-config <run_config>] [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --image-name llama-stack-img --container-base python:3.12-slim --normal-deps 'numpy pandas' --run-config ./run.yaml --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+image_name=""
+container_base=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+run_config=""
+template_or_config=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --image-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --image-name requires a string value" >&2
+        usage
+      fi
+      image_name="$2"
+      shift 2
+      ;;
+    --container-base)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --container-base requires a string value" >&2
+        usage
+      fi
+      container_base="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    --run-config)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --run-config requires a string value" >&2
+        usage
+      fi
+      run_config="$2"
+      shift 2
+      ;;
+    --template-or-config)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --template-or-config requires a string value" >&2
+        usage
+      fi
+      template_or_config="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$image_name" || -z "$container_base" || -z "$normal_deps" ]]; then
+  echo "Error: --image-name, --container-base, and --normal-deps are required." >&2
+  usage
+fi
+
 CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
 CONTAINER_OPTS=${CONTAINER_OPTS:---progress=plain}
-
 TEMP_DIR=$(mktemp -d)
-
 SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
 source "$SCRIPT_DIR/common.sh"

@ -81,18 +132,15 @@ add_to_container() {
  if [ -t 0 ]; then
    printf '%s\n' "$1" >>"$output_file"
  else
-    # If stdin is not a terminal, read from it (heredoc)
    cat >>"$output_file"
  fi
 }

-# Check if container command is available
 if ! is_command_available "$CONTAINER_BINARY"; then
  printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2
  exit 1
 fi

-# Update and install UBI9 components if UBI9 base image is used
 if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
  add_to_container << EOF
 FROM $container_base
@ -135,16 +183,16 @@ EOF

 # Add pip dependencies first since llama-stack is what will change most often
 # so we can reuse layers.
-if [ -n "$pip_dependencies" ]; then
-  read -ra pip_args <<< "$pip_dependencies"
+if [ -n "$normal_deps" ]; then
+  read -ra pip_args <<<  "$normal_deps"
  quoted_deps=$(printf " %q" "${pip_args[@]}")
  add_to_container << EOF
 RUN $MOUNT_CACHE uv pip install $quoted_deps
 EOF
 fi

-if [ -n "$special_pip_deps" ]; then
-  IFS='#' read -ra parts <<<"$special_pip_deps"
+if [ -n "$optional_deps" ]; then
+  IFS='#' read -ra parts <<<"$optional_deps"
  for part in "${parts[@]}"; do
    read -ra pip_args <<< "$part"
    quoted_deps=$(printf " %q" "${pip_args[@]}")
@ -154,7 +202,33 @@ EOF
  done
 fi

-# Function to get Python command
+if [ -n "$external_provider_deps" ]; then
+  IFS='#' read -ra parts <<<"$external_provider_deps"
+  for part in "${parts[@]}"; do
+    read -ra pip_args <<< "$part"
+    quoted_deps=$(printf " %q" "${pip_args[@]}")
+    add_to_container <<EOF
+RUN $MOUNT_CACHE uv pip install $quoted_deps
+EOF
+    add_to_container <<EOF
+RUN python3 - <<PYTHON | $MOUNT_CACHE uv pip install -r -
+import importlib
+import sys
+
+try:
+    package_name = '$part'.split('==')[0].split('>=')[0].split('<=')[0].split('!=')[0].split('<')[0].split('>')[0]
+    module = importlib.import_module(f'{package_name}.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        if isinstance(spec.pip_packages, (list, tuple)):
+            print('\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for {package_name}: {e}', file=sys.stderr)
+PYTHON
+EOF
+  done
+fi
+
 get_python_cmd() {
    if is_command_available python; then
        echo "python"
--- a/llama_stack/distribution/build_venv.sh
+++ b/llama_stack/distribution/build_venv.sh
@ -18,6 +18,76 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
 UV_SYSTEM_PYTHON=${UV_SYSTEM_PYTHON:-}
 VIRTUAL_ENV=${VIRTUAL_ENV:-}

+set -euo pipefail
+
+# Define color codes
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+source "$SCRIPT_DIR/common.sh"
+
+# Usage function
+usage() {
+  echo "Usage: $0 --env-name <env_name> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --env-name mybuild --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+env_name=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --env-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --env-name requires a string value" >&2
+        usage
+      fi
+      env_name="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$env_name" || -z "$normal_deps" ]]; then
+  echo "Error: --env-name and --normal-deps are required." >&2
+  usage
+fi
+
 if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
 fi
@ -25,29 +95,6 @@ if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
 fi

-if [ "$#" -lt 2 ]; then
-  echo "Usage: $0 <env_name> <pip_dependencies> [<special_pip_deps>]" >&2
-  echo "Example: $0 mybuild ./my-stack-build.yaml 'numpy pandas scipy'" >&2
-  exit 1
-fi
-
-special_pip_deps="$3"
-
-set -euo pipefail
-
-env_name="$1"
-pip_dependencies="$2"
-
-# Define color codes
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-
-# this is set if we actually create a new conda in which case we need to clean up
-ENVNAME=""
-
-SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-source "$SCRIPT_DIR/common.sh"
-
 # pre-run checks to make sure we can proceed with the installation
 pre_run_checks() {
  local env_name="$1"
@ -71,49 +118,44 @@ pre_run_checks() {
 }

 run() {
-  local env_name="$1"
-  local pip_dependencies="$2"
-  local special_pip_deps="$3"
-
+  # Use only global variables set by flag parser
  if [ -n "$UV_SYSTEM_PYTHON" ] || [ "$env_name" == "__system__" ]; then
    echo "Installing dependencies in system Python environment"
-    # if env == __system__, ensure we set UV_SYSTEM_PYTHON
    export UV_SYSTEM_PYTHON=1
  elif [ "$VIRTUAL_ENV" == "$env_name" ]; then
    echo "Virtual environment $env_name is already active"
  else
    echo "Using virtual environment $env_name"
    uv venv "$env_name"
-    # shellcheck source=/dev/null
    source "$env_name/bin/activate"
  fi

  if [ -n "$TEST_PYPI_VERSION" ]; then
-    # these packages are damaged in test-pypi, so install them first
    uv pip install fastapi libcst
-    # shellcheck disable=SC2086
-    # we are building a command line so word splitting is expected
    uv pip install --extra-index-url https://test.pypi.org/simple/ \
      --index-strategy unsafe-best-match \
      llama-stack=="$TEST_PYPI_VERSION" \
-      $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+      $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
-        # shellcheck disable=SC2086
-        # we are building a command line so word splitting is expected
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install "$part"
+      done
+    fi
  else
-    # Re-installing llama-stack in the new virtual environment
    if [ -n "$LLAMA_STACK_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_DIR: %s\n"  "$LLAMA_STACK_DIR"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
@ -125,27 +167,41 @@ run() {
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi

-    # Install pip dependencies
    printf "Installing pip dependencies\n"
-    # shellcheck disable=SC2086
-    # we are building a command line so word splitting is expected
-    uv pip install $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+    uv pip install $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
-        echo "$part"
-        # shellcheck disable=SC2086
-        # we are building a command line so word splitting is expected
+        echo "Installing special provider module: $part"
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "Installing external provider module: $part"
+        uv pip install "$part"
+        echo "Getting provider spec for module: $part and installing dependencies"
+        package_name=$(echo "$part" | sed 's/[<>=!].*//')
+        python3 -c "
+import importlib
+import sys
+try:
+    module = importlib.import_module(f'$package_name.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        print('\\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
+" | uv pip install -r -
+      done
+    fi
  fi
 }

 pre_run_checks "$env_name"
-run "$env_name" "$pip_dependencies" "$special_pip_deps"
+run
--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@ -91,21 +91,22 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec

            logger.info(f"Configuring API `{api_str}`...")
            updated_providers = []
-            for i, provider_type in enumerate(plist):
+            for i, provider in enumerate(plist):
                if i >= 1:
-                    others = ", ".join(plist[i:])
+                    others = ", ".join(p.provider_type for p in plist[i:])
                    logger.info(
                        f"Not configuring other providers ({others}) interactively. Please edit the resulting YAML directly.\n"
                    )
                    break

-                logger.info(f"> Configuring provider `({provider_type})`")
+                logger.info(f"> Configuring provider `({provider.provider_type})`")
+                pid = provider.provider_type.split("::")[-1]
                updated_providers.append(
                    configure_single_provider(
                        provider_registry[api],
                        Provider(
-                            provider_id=(f"{provider_type}-{i:02d}" if len(plist) > 1 else provider_type),
-                            provider_type=provider_type,
+                            provider_id=(f"{pid}-{i:02d}" if len(plist) > 1 else pid),
+                            provider_type=provider.provider_type,
                            config={},
                        ),
                    )
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@ -36,6 +36,11 @@ LLAMA_STACK_RUN_CONFIG_VERSION = 2
 RoutingKey = str | list[str]


+class RegistryEntrySource(StrEnum):
+    via_register_api = "via_register_api"
+    listed_from_provider = "listed_from_provider"
+
+
 class User(BaseModel):
    principal: str
    # further attributes that may be used for access control decisions
@ -50,6 +55,7 @@ class ResourceWithOwner(Resource):
    resource. This can be used to constrain access to the resource."""

    owner: User | None = None
+    source: RegistryEntrySource = RegistryEntrySource.via_register_api


 # Use the extended Resource for all routable objects
@ -130,29 +136,54 @@ class RoutingTableProviderSpec(ProviderSpec):
    pip_packages: list[str] = Field(default_factory=list)


+class Provider(BaseModel):
+    # provider_id of None means that the provider is not enabled - this happens
+    # when the provider is enabled via a conditional environment variable
+    provider_id: str | None
+    provider_type: str
+    config: dict[str, Any] = {}
+    module: str | None = Field(
+        default=None,
+        description="""
+ Fully-qualified name of the external provider module to import. The module is expected to have:
+
+  - `get_adapter_impl(config, deps)`: returns the adapter implementation
+
+  Example: `module: ramalama_stack`
+ """,
+    )
+
+
+class BuildProvider(BaseModel):
+    provider_type: str
+    module: str | None = Field(
+        default=None,
+        description="""
+ Fully-qualified name of the external provider module to import. The module is expected to have:
+
+  - `get_adapter_impl(config, deps)`: returns the adapter implementation
+
+  Example: `module: ramalama_stack`
+ """,
+    )
+
+
 class DistributionSpec(BaseModel):
    description: str | None = Field(
        default="",
        description="Description of the distribution",
    )
    container_image: str | None = None
-    providers: dict[str, str | list[str]] = Field(
+    providers: dict[str, list[BuildProvider]] = Field(
        default_factory=dict,
        description="""
-Provider Types for each of the APIs provided by this distribution. If you
-select multiple providers, you should provide an appropriate 'routing_map'
-in the runtime configuration to help route to the correct provider.""",
+        Provider Types for each of the APIs provided by this distribution. If you
+        select multiple providers, you should provide an appropriate 'routing_map'
+        in the runtime configuration to help route to the correct provider.
+        """,
    )


-class Provider(BaseModel):
-    # provider_id of None means that the provider is not enabled - this happens
-    # when the provider is enabled via a conditional environment variable
-    provider_id: str | None
-    provider_type: str
-    config: dict[str, Any]
-
-
 class LoggingConfig(BaseModel):
    category_levels: dict[str, str] = Field(
        default_factory=dict,
@ -381,6 +412,11 @@ a default SQLite store will be used.""",
        description="Path to directory containing external provider implementations. The providers code and dependencies must be installed on the system.",
    )

+    external_apis_dir: Path | None = Field(
+        default=None,
+        description="Path to directory containing external API implementations. The APIs code and dependencies must be installed on the system.",
+    )
+
    @field_validator("external_providers_dir")
    @classmethod
    def validate_external_providers_dir(cls, v):
@ -412,6 +448,10 @@ class BuildConfig(BaseModel):
        default_factory=list,
        description="Additional pip packages to install in the distribution. These packages will be installed in the distribution environment.",
    )
+    external_apis_dir: Path | None = Field(
+        default=None,
+        description="Path to directory containing external API implementations. The APIs code and dependencies must be installed on the system.",
+    )

    @field_validator("external_providers_dir")
    @classmethod
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@ -12,6 +12,8 @@ from typing import Any
 import yaml
 from pydantic import BaseModel

+from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
+from llama_stack.distribution.external import load_external_apis
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import (
    AdapterSpec,
@ -96,12 +98,10 @@ def _load_inline_provider_spec(spec_data: dict[str, Any], api: Api, provider_nam
    return spec


-def get_provider_registry(
-    config=None,
-) -> dict[Api, dict[str, ProviderSpec]]:
+def get_provider_registry(config=None) -> dict[Api, dict[str, ProviderSpec]]:
    """Get the provider registry, optionally including external providers.

-    This function loads both built-in providers and external providers from YAML files.
+    This function loads both built-in providers and external providers from YAML files or from their provided modules.
    External providers are loaded from a directory structure like:

    providers.d/
@ -122,8 +122,13 @@ def get_provider_registry(
        safety/
          llama-guard.yaml

+    This method is overloaded in that it can be called from a variety of places: during build, during run, during stack construction.
+    So when building external providers from a module, there are scenarios where the pip package required to import the module might not be available yet.
+    There is special handling for all of the potential cases this method can be called from.
+
    Args:
        config: Optional object containing the external providers directory path
+        building: Optional bool delineating whether or not this is being called from a build process

    Returns:
        A dictionary mapping APIs to their available providers
@ -133,58 +138,140 @@ def get_provider_registry(
        ValueError: If any provider spec is invalid
    """

-    ret: dict[Api, dict[str, ProviderSpec]] = {}
+    registry: dict[Api, dict[str, ProviderSpec]] = {}
    for api in providable_apis():
        name = api.name.lower()
        logger.debug(f"Importing module {name}")
        try:
            module = importlib.import_module(f"llama_stack.providers.registry.{name}")
-            ret[api] = {a.provider_type: a for a in module.available_providers()}
+            registry[api] = {a.provider_type: a for a in module.available_providers()}
        except ImportError as e:
            logger.warning(f"Failed to import module {name}: {e}")

-    # Check if config has the external_providers_dir attribute
-    if config and hasattr(config, "external_providers_dir") and config.external_providers_dir:
-        external_providers_dir = os.path.abspath(os.path.expanduser(config.external_providers_dir))
-        if not os.path.exists(external_providers_dir):
-            raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
-        logger.info(f"Loading external providers from {external_providers_dir}")
+    # Refresh providable APIs with external APIs if any
+    external_apis = load_external_apis(config)
+    for api, api_spec in external_apis.items():
+        name = api_spec.name.lower()
+        logger.info(f"Importing external API {name} module {api_spec.module}")
+        try:
+            module = importlib.import_module(api_spec.module)
+            registry[api] = {a.provider_type: a for a in module.available_providers()}
+        except (ImportError, AttributeError) as e:
+            # Populate the registry with an empty dict to avoid breaking the provider registry
+            # This assume that the in-tree provider(s) are not available for this API which means
+            # that users will need to use external providers for this API.
+            registry[api] = {}
+            logger.error(
+                f"Failed to import external API {name}: {e}. Could not populate the in-tree provider(s) registry for {api.name}. \n"
+                "Install the API package to load any in-tree providers for this API."
+            )

-        for api in providable_apis():
-            api_name = api.name.lower()
+    # Check if config has external providers
+    if config:
+        if hasattr(config, "external_providers_dir") and config.external_providers_dir:
+            registry = get_external_providers_from_dir(registry, config)
+        # else lets check for modules in each provider
+        registry = get_external_providers_from_module(
+            registry=registry,
+            config=config,
+            building=(isinstance(config, BuildConfig) or isinstance(config, DistributionSpec)),
+        )

-            # Process both remote and inline providers
-            for provider_type in ["remote", "inline"]:
-                api_dir = os.path.join(external_providers_dir, provider_type, api_name)
-                if not os.path.exists(api_dir):
-                    logger.debug(f"No {provider_type} provider directory found for {api_name}")
-                    continue
+    return registry

-                # Look for provider spec files in the API directory
-                for spec_path in glob.glob(os.path.join(api_dir, "*.yaml")):
-                    provider_name = os.path.splitext(os.path.basename(spec_path))[0]
-                    logger.info(f"Loading {provider_type} provider spec from {spec_path}")

-                    try:
-                        with open(spec_path) as f:
-                            spec_data = yaml.safe_load(f)
+def get_external_providers_from_dir(
+    registry: dict[Api, dict[str, ProviderSpec]], config
+) -> dict[Api, dict[str, ProviderSpec]]:
+    logger.warning(
+        "Specifying external providers via `external_providers_dir` is being deprecated. Please specify `module:` in the provider instead."
+    )
+    external_providers_dir = os.path.abspath(os.path.expanduser(config.external_providers_dir))
+    if not os.path.exists(external_providers_dir):
+        raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
+    logger.info(f"Loading external providers from {external_providers_dir}")

-                        if provider_type == "remote":
-                            spec = _load_remote_provider_spec(spec_data, api)
-                            provider_type_key = f"remote::{provider_name}"
-                        else:
-                            spec = _load_inline_provider_spec(spec_data, api, provider_name)
-                            provider_type_key = f"inline::{provider_name}"
+    for api in providable_apis():
+        api_name = api.name.lower()

-                        logger.info(f"Loaded {provider_type} provider spec for {provider_type_key} from {spec_path}")
-                        if provider_type_key in ret[api]:
-                            logger.warning(f"Overriding already registered provider {provider_type_key} for {api.name}")
-                        ret[api][provider_type_key] = spec
-                        logger.info(f"Successfully loaded external provider {provider_type_key}")
-                    except yaml.YAMLError as yaml_err:
-                        logger.error(f"Failed to parse YAML file {spec_path}: {yaml_err}")
-                        raise yaml_err
-                    except Exception as e:
-                        logger.error(f"Failed to load provider spec from {spec_path}: {e}")
-                        raise e
-    return ret
+        # Process both remote and inline providers
+        for provider_type in ["remote", "inline"]:
+            api_dir = os.path.join(external_providers_dir, provider_type, api_name)
+            if not os.path.exists(api_dir):
+                logger.debug(f"No {provider_type} provider directory found for {api_name}")
+                continue
+
+            # Look for provider spec files in the API directory
+            for spec_path in glob.glob(os.path.join(api_dir, "*.yaml")):
+                provider_name = os.path.splitext(os.path.basename(spec_path))[0]
+                logger.info(f"Loading {provider_type} provider spec from {spec_path}")
+
+                try:
+                    with open(spec_path) as f:
+                        spec_data = yaml.safe_load(f)
+
+                    if provider_type == "remote":
+                        spec = _load_remote_provider_spec(spec_data, api)
+                        provider_type_key = f"remote::{provider_name}"
+                    else:
+                        spec = _load_inline_provider_spec(spec_data, api, provider_name)
+                        provider_type_key = f"inline::{provider_name}"
+
+                    logger.info(f"Loaded {provider_type} provider spec for {provider_type_key} from {spec_path}")
+                    if provider_type_key in registry[api]:
+                        logger.warning(f"Overriding already registered provider {provider_type_key} for {api.name}")
+                    registry[api][provider_type_key] = spec
+                    logger.info(f"Successfully loaded external provider {provider_type_key}")
+                except yaml.YAMLError as yaml_err:
+                    logger.error(f"Failed to parse YAML file {spec_path}: {yaml_err}")
+                    raise yaml_err
+                except Exception as e:
+                    logger.error(f"Failed to load provider spec from {spec_path}: {e}")
+                    raise e
+
+    return registry
+
+
+def get_external_providers_from_module(
+    registry: dict[Api, dict[str, ProviderSpec]], config, building: bool
+) -> dict[Api, dict[str, ProviderSpec]]:
+    provider_list = None
+    if isinstance(config, BuildConfig):
+        provider_list = config.distribution_spec.providers.items()
+    else:
+        provider_list = config.providers.items()
+    if provider_list is None:
+        logger.warning("Could not get list of providers from config")
+        return registry
+    for provider_api, providers in provider_list:
+        for provider in providers:
+            if not hasattr(provider, "module") or provider.module is None:
+                continue
+            # get provider using module
+            try:
+                if not building:
+                    package_name = provider.module.split("==")[0]
+                    module = importlib.import_module(f"{package_name}.provider")
+                    # if config class is wrong you will get an error saying module could not be imported
+                    spec = module.get_provider_spec()
+                else:
+                    # pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
+                    spec = ProviderSpec(
+                        api=Api(provider_api),
+                        provider_type=provider.provider_type,
+                        is_external=True,
+                        module=provider.module,
+                        config_class="",
+                    )
+                provider_type = provider.provider_type
+                # in the case we are building we CANNOT import this module of course because it has not been installed.
+                # return a partially filled out spec that the build script will populate.
+                registry[Api(provider_api)][provider_type] = spec
+            except ModuleNotFoundError as exc:
+                raise ValueError(
+                    "get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"
+                ) from exc
+            except Exception as e:
+                logger.error(f"Failed to load provider spec from module {provider.module}: {e}")
+                raise e
+    return registry
--- a/llama_stack/distribution/external.py
+++ b/llama_stack/distribution/external.py
@ -0,0 +1,54 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import yaml
+
+from llama_stack.apis.datatypes import Api, ExternalApiSpec
+from llama_stack.distribution.datatypes import BuildConfig, StackRunConfig
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="core")
+
+
+def load_external_apis(config: StackRunConfig | BuildConfig | None) -> dict[Api, ExternalApiSpec]:
+    """Load external API specifications from the configured directory.
+
+    Args:
+        config: StackRunConfig or BuildConfig containing the external APIs directory path
+
+    Returns:
+        A dictionary mapping API names to their specifications
+    """
+    if not config or not config.external_apis_dir:
+        return {}
+
+    external_apis_dir = config.external_apis_dir.expanduser().resolve()
+    if not external_apis_dir.is_dir():
+        logger.error(f"External APIs directory is not a directory: {external_apis_dir}")
+        return {}
+
+    logger.info(f"Loading external APIs from {external_apis_dir}")
+    external_apis: dict[Api, ExternalApiSpec] = {}
+
+    # Look for YAML files in the external APIs directory
+    for yaml_path in external_apis_dir.glob("*.yaml"):
+        try:
+            with open(yaml_path) as f:
+                spec_data = yaml.safe_load(f)
+
+            spec = ExternalApiSpec(**spec_data)
+            api = Api.add(spec.name)
+            logger.info(f"Loaded external API spec for {spec.name} from {yaml_path}")
+            external_apis[api] = spec
+        except yaml.YAMLError as yaml_err:
+            logger.error(f"Failed to parse YAML file {yaml_path}: {yaml_err}")
+            raise
+        except Exception:
+            logger.exception(f"Failed to load external API spec from {yaml_path}")
+            raise
+
+    return external_apis
--- a/llama_stack/distribution/inspect.py
+++ b/llama_stack/distribution/inspect.py
@ -16,6 +16,7 @@ from llama_stack.apis.inspect import (
    VersionInfo,
 )
 from llama_stack.distribution.datatypes import StackRunConfig
+from llama_stack.distribution.external import load_external_apis
 from llama_stack.distribution.server.routes import get_all_api_routes
 from llama_stack.providers.datatypes import HealthStatus

@ -42,7 +43,8 @@ class DistributionInspectImpl(Inspect):
        run_config: StackRunConfig = self.config.run_config

        ret = []
-        all_endpoints = get_all_api_routes()
+        external_apis = load_external_apis(run_config)
+        all_endpoints = get_all_api_routes(external_apis)
        for api, endpoints in all_endpoints.items():
            # Always include provider and inspect APIs, filter others based on run config
            if api.value in ["providers", "inspect"]:
@ -53,7 +55,8 @@ class DistributionInspectImpl(Inspect):
                            method=next(iter([m for m in e.methods if m != "HEAD"])),
                            provider_types=[],  # These APIs don't have "real" providers - they're internal to the stack
                        )
-                        for e in endpoints
+                        for e, _ in endpoints
+                        if e.methods is not None
                    ]
                )
            else:
@ -66,7 +69,8 @@ class DistributionInspectImpl(Inspect):
                                method=next(iter([m for m in e.methods if m != "HEAD"])),
                                provider_types=[p.provider_type for p in providers],
                            )
-                            for e in endpoints
+                            for e, _ in endpoints
+                            if e.methods is not None
                        ]
                    )

--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@ -33,7 +33,7 @@ from termcolor import cprint

 from llama_stack.distribution.build import print_pip_install_help
 from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-from llama_stack.distribution.datatypes import Api, BuildConfig, DistributionSpec
+from llama_stack.distribution.datatypes import Api, BuildConfig, BuildProvider, DistributionSpec
 from llama_stack.distribution.request_headers import (
    PROVIDER_DATA_VAR,
    request_provider_data_context,
@ -161,7 +161,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            if not self.skip_logger_removal:
                self._remove_root_logger_handlers()

-        return self.loop.run_until_complete(self.async_client.initialize())
+        # use a new event loop to avoid interfering with the main event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            return loop.run_until_complete(self.async_client.initialize())
+        finally:
+            asyncio.set_event_loop(None)

    def _remove_root_logger_handlers(self):
        """
@ -243,15 +249,16 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                file=sys.stderr,
            )
            if self.config_path_or_template_name.endswith(".yaml"):
-                # Convert Provider objects to their types
-                provider_types: dict[str, str | list[str]] = {}
-                for api, providers in self.config.providers.items():
-                    types = [p.provider_type for p in providers]
-                    # Convert single-item lists to strings
-                    provider_types[api] = types[0] if len(types) == 1 else types
+                providers: dict[str, list[BuildProvider]] = {}
+                for api, run_providers in self.config.providers.items():
+                    for provider in run_providers:
+                        providers.setdefault(api, []).append(
+                            BuildProvider(provider_type=provider.provider_type, module=provider.module)
+                        )
+                providers = dict(providers)
                build_config = BuildConfig(
                    distribution_spec=DistributionSpec(
-                        providers=provider_types,
+                        providers=providers,
                    ),
                    external_providers_dir=self.config.external_providers_dir,
                )
@ -353,13 +360,15 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        body = options.params or {}
        body |= options.json_data or {}

-        matched_func, path_params, route = find_matching_route(options.method, path, self.route_impls)
+        matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params

        body, field_names = self._handle_file_uploads(options, body)

        body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
-        await start_trace(route, {"__location__": "library_client"})
+
+        trace_path = webmethod.descriptive_name or route_path
+        await start_trace(trace_path, {"__location__": "library_client"})
        try:
            result = await matched_func(**body)
        finally:
@ -409,12 +418,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        path = options.url
        body = options.params or {}
        body |= options.json_data or {}
-        func, path_params, route = find_matching_route(options.method, path, self.route_impls)
+        func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params

        body = self._convert_body(path, options.method, body)

-        await start_trace(route, {"__location__": "library_client"})
+        trace_path = webmethod.descriptive_name or route_path
+        await start_trace(trace_path, {"__location__": "library_client"})

        async def gen():
            try:
@ -445,8 +455,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        # we use asynchronous impl always internally and channel all requests to AsyncLlamaStackClient
        # however, the top-level caller may be a SyncAPIClient -- so its stream_cls might be a Stream (SyncStream)
        # so we need to convert it to AsyncStream
+        # mypy can't track runtime variables inside the [...] of a generic, so ignore that check
        args = get_args(stream_cls)
-        stream_cls = AsyncStream[args[0]]
+        stream_cls = AsyncStream[args[0]]  # type: ignore[valid-type]
        response = AsyncAPIResponse(
            raw=mock_response,
            client=self,
@ -468,7 +479,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        exclude_params = exclude_params or set()

-        func, _, _ = find_matching_route(method, path, self.route_impls)
+        func, _, _, _ = find_matching_route(method, path, self.route_impls)
        sig = inspect.signature(func)

        # Strip NOT_GIVENs to use the defaults in signature
--- a/llama_stack/distribution/request_headers.py
+++ b/llama_stack/distribution/request_headers.py
@ -101,3 +101,15 @@ def get_authenticated_user() -> User | None:
    if not provider_data:
        return None
    return provider_data.get("__authenticated_user")
+
+
+def user_from_scope(scope: dict) -> User | None:
+    """Create a User object from ASGI scope data (set by authentication middleware)"""
+    user_attributes = scope.get("user_attributes", {})
+    principal = scope.get("principal", "")
+
+    # auth not enabled
+    if not principal and not user_attributes:
+        return None
+
+    return User(principal=principal, attributes=user_attributes)
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -11,6 +11,7 @@ from llama_stack.apis.agents import Agents
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.datatypes import ExternalApiSpec
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference, InferenceProvider
@ -35,6 +36,7 @@ from llama_stack.distribution.datatypes import (
    StackRunConfig,
 )
 from llama_stack.distribution.distribution import builtin_automatically_routed_apis
+from llama_stack.distribution.external import load_external_apis
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
@ -59,8 +61,16 @@ class InvalidProviderError(Exception):
    pass


-def api_protocol_map() -> dict[Api, Any]:
-    return {
+def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) -> dict[Api, Any]:
+    """Get a mapping of API types to their protocol classes.
+
+    Args:
+        external_apis: Optional dictionary of external API specifications
+
+    Returns:
+        Dictionary mapping API types to their protocol classes
+    """
+    protocols = {
        Api.providers: ProvidersAPI,
        Api.agents: Agents,
        Api.inference: Inference,
@ -83,10 +93,23 @@ def api_protocol_map() -> dict[Api, Any]:
        Api.files: Files,
    }

+    if external_apis:
+        for api, api_spec in external_apis.items():
+            try:
+                module = importlib.import_module(api_spec.module)
+                api_class = getattr(module, api_spec.protocol)

-def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+                protocols[api] = api_class
+            except (ImportError, AttributeError):
+                logger.exception(f"Failed to load external API {api_spec.name}")
+
+    return protocols
+
+
+def api_protocol_map_for_compliance_check(config: Any) -> dict[Api, Any]:
+    external_apis = load_external_apis(config)
    return {
-        **api_protocol_map(),
+        **api_protocol_map(external_apis),
        Api.inference: InferenceProvider,
    }

@ -250,7 +273,7 @@ async def instantiate_providers(
    dist_registry: DistributionRegistry,
    run_config: StackRunConfig,
    policy: list[AccessRule],
-) -> dict:
+) -> dict[Api, Any]:
    """Instantiates providers asynchronously while managing dependencies."""
    impls: dict[Api, Any] = {}
    inner_impls_by_provider_id: dict[str, dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
@ -322,7 +345,7 @@ async def instantiate_provider(
    policy: list[AccessRule],
 ):
    provider_spec = provider.spec
-    if not hasattr(provider_spec, "module"):
+    if not hasattr(provider_spec, "module") or provider_spec.module is None:
        raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")

    logger.debug(f"Instantiating provider {provider.provider_id} from {provider_spec.module}")
@ -360,7 +383,7 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config

-    protocols = api_protocol_map_for_compliance_check()
+    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
    # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
--- a/llama_stack/distribution/routing_tables/common.py
+++ b/llama_stack/distribution/routing_tables/common.py
@ -117,6 +117,9 @@ class CommonRoutingTableImpl(RoutingTable):
        for p in self.impls_by_provider_id.values():
            await p.shutdown()

+    async def refresh(self) -> None:
+        pass
+
    async def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
        from .benchmarks import BenchmarksRoutingTable
        from .datasets import DatasetsRoutingTable
@ -206,7 +209,6 @@ class CommonRoutingTableImpl(RoutingTable):
        if obj.type == ResourceType.model.value:
            await self.dist_registry.register(registered_obj)
            return registered_obj
-
        else:
            await self.dist_registry.register(obj)
            return obj
--- a/llama_stack/distribution/routing_tables/models.py
+++ b/llama_stack/distribution/routing_tables/models.py
@ -10,6 +10,7 @@ from typing import Any
 from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
 from llama_stack.distribution.datatypes import (
    ModelWithOwner,
+    RegistryEntrySource,
 )
 from llama_stack.log import get_logger

@ -19,6 +20,27 @@ logger = get_logger(name=__name__, category="core")


 class ModelsRoutingTable(CommonRoutingTableImpl, Models):
+    listed_providers: set[str] = set()
+
+    async def refresh(self) -> None:
+        for provider_id, provider in self.impls_by_provider_id.items():
+            refresh = await provider.should_refresh_models()
+            refresh = refresh or provider_id not in self.listed_providers
+            if not refresh:
+                continue
+
+            try:
+                models = await provider.list_models()
+            except Exception as e:
+                logger.exception(f"Model refresh failed for provider {provider_id}: {e}")
+                continue
+
+            self.listed_providers.add(provider_id)
+            if models is None:
+                continue
+
+            await self.update_registered_models(provider_id, models)
+
    async def list_models(self) -> ListModelsResponse:
        return ListModelsResponse(data=await self.get_all_with_type("model"))

@ -81,6 +103,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
            provider_id=provider_id,
            metadata=metadata,
            model_type=model_type,
+            source=RegistryEntrySource.via_register_api,
        )
        registered_model = await self.register_object(model)
        return registered_model
@ -91,7 +114,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
            raise ValueError(f"Model {model_id} not found")
        await self.unregister_object(existing_model)

-    async def update_registered_llm_models(
+    async def update_registered_models(
        self,
        provider_id: str,
        models: list[Model],
@ -102,18 +125,22 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
        # from run.yaml) that we need to keep track of
        model_ids = {}
        for model in existing_models:
-            # we leave embeddings models alone because often we don't get metadata
-            # (embedding dimension, etc.) from the provider
-            if model.provider_id == provider_id and model.model_type == ModelType.llm:
+            if model.provider_id != provider_id:
+                continue
+            if model.source == RegistryEntrySource.via_register_api:
                model_ids[model.provider_resource_id] = model.identifier
-                logger.debug(f"unregistering model {model.identifier}")
-                await self.unregister_object(model)
+                continue
+
+            logger.debug(f"unregistering model {model.identifier}")
+            await self.unregister_object(model)

        for model in models:
-            if model.model_type != ModelType.llm:
-                continue
            if model.provider_resource_id in model_ids:
-                model.identifier = model_ids[model.provider_resource_id]
+                # avoid overwriting a non-provider-registered model entry
+                continue
+
+            if model.identifier == model.provider_resource_id:
+                model.identifier = f"{provider_id}/{model.provider_resource_id}"

            logger.debug(f"registering model {model.identifier} ({model.provider_resource_id})")
            await self.register_object(
@ -123,5 +150,6 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
                    provider_id=provider_id,
                    metadata=model.metadata,
                    model_type=model.model_type,
+                    source=RegistryEntrySource.listed_from_provider,
                )
            )
--- a/llama_stack/distribution/server/auth.py
+++ b/llama_stack/distribution/server/auth.py
@ -7,9 +7,12 @@
 import json

 import httpx
+from aiohttp import hdrs

-from llama_stack.distribution.datatypes import AuthenticationConfig
+from llama_stack.distribution.datatypes import AuthenticationConfig, User
+from llama_stack.distribution.request_headers import user_from_scope
 from llama_stack.distribution.server.auth_providers import create_auth_provider
+from llama_stack.distribution.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.log import get_logger

 logger = get_logger(name=__name__, category="auth")
@ -78,12 +81,14 @@ class AuthenticationMiddleware:
    access resources that don't have access_attributes defined.
    """

-    def __init__(self, app, auth_config: AuthenticationConfig):
+    def __init__(self, app, auth_config: AuthenticationConfig, impls):
        self.app = app
+        self.impls = impls
        self.auth_provider = create_auth_provider(auth_config)

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http":
+            # First, handle authentication
            headers = dict(scope.get("headers", []))
            auth_header = headers.get(b"authorization", b"").decode()

@ -121,15 +126,50 @@ class AuthenticationMiddleware:
                f"Authentication successful: {validation_result.principal} with {len(validation_result.attributes)} attributes"
            )

+            # Scope-based API access control
+            path = scope.get("path", "")
+            method = scope.get("method", hdrs.METH_GET)
+
+            if not hasattr(self, "route_impls"):
+                self.route_impls = initialize_route_impls(self.impls)
+
+            try:
+                _, _, _, webmethod = find_matching_route(method, path, self.route_impls)
+            except ValueError:
+                # If no matching endpoint is found, pass through to FastAPI
+                return await self.app(scope, receive, send)
+
+            if webmethod.required_scope:
+                user = user_from_scope(scope)
+                if not _has_required_scope(webmethod.required_scope, user):
+                    return await self._send_auth_error(
+                        send,
+                        f"Access denied: user does not have required scope: {webmethod.required_scope}",
+                        status=403,
+                    )
+
        return await self.app(scope, receive, send)

-    async def _send_auth_error(self, send, message):
+    async def _send_auth_error(self, send, message, status=401):
        await send(
            {
                "type": "http.response.start",
-                "status": 401,
+                "status": status,
                "headers": [[b"content-type", b"application/json"]],
            }
        )
-        error_msg = json.dumps({"error": {"message": message}}).encode()
+        error_key = "message" if status == 401 else "detail"
+        error_msg = json.dumps({"error": {error_key: message}}).encode()
        await send({"type": "http.response.body", "body": error_msg})
+
+
+def _has_required_scope(required_scope: str, user: User | None) -> bool:
+    # if no user, assume auth is not enabled
+    if not user:
+        return True
+
+    if not user.attributes:
+        return False
+
+    user_scopes = user.attributes.get("scopes", [])
+    return required_scope in user_scopes
--- a/llama_stack/distribution/server/routes.py
+++ b/llama_stack/distribution/server/routes.py
@ -12,17 +12,18 @@ from typing import Any
 from aiohttp import hdrs
 from starlette.routing import Route

+from llama_stack.apis.datatypes import Api, ExternalApiSpec
 from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.distribution.resolver import api_protocol_map
-from llama_stack.providers.datatypes import Api
+from llama_stack.schema_utils import WebMethod

 EndpointFunc = Callable[..., Any]
 PathParams = dict[str, str]
-RouteInfo = tuple[EndpointFunc, str]
+RouteInfo = tuple[EndpointFunc, str, WebMethod]
 PathImpl = dict[str, RouteInfo]
 RouteImpls = dict[str, PathImpl]
-RouteMatch = tuple[EndpointFunc, PathParams, str]
+RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]


 def toolgroup_protocol_map():
@ -31,10 +32,12 @@ def toolgroup_protocol_map():
    }


-def get_all_api_routes() -> dict[Api, list[Route]]:
+def get_all_api_routes(
+    external_apis: dict[Api, ExternalApiSpec] | None = None,
+) -> dict[Api, list[tuple[Route, WebMethod]]]:
    apis = {}

-    protocols = api_protocol_map()
+    protocols = api_protocol_map(external_apis)
    toolgroup_protocols = toolgroup_protocol_map()
    for api, protocol in protocols.items():
        routes = []
@ -65,7 +68,7 @@ def get_all_api_routes() -> dict[Api, list[Route]]:
            else:
                http_method = hdrs.METH_POST
            routes.append(
-                Route(path=path, methods=[http_method], name=name, endpoint=None)
+                (Route(path=path, methods=[http_method], name=name, endpoint=None), webmethod)
            )  # setting endpoint to None since don't use a Router object

        apis[api] = routes
@ -73,8 +76,8 @@ def get_all_api_routes() -> dict[Api, list[Route]]:
    return apis


-def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:
-    routes = get_all_api_routes()
+def initialize_route_impls(impls, external_apis: dict[Api, ExternalApiSpec] | None = None) -> RouteImpls:
+    api_to_routes = get_all_api_routes(external_apis)
    route_impls: RouteImpls = {}

    def _convert_path_to_regex(path: str) -> str:
@ -88,10 +91,10 @@ def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:

        return f"^{pattern}$"

-    for api, api_routes in routes.items():
+    for api, api_routes in api_to_routes.items():
        if api not in impls:
            continue
-        for route in api_routes:
+        for route, webmethod in api_routes:
            impl = impls[api]
            func = getattr(impl, route.name)
            # Get the first (and typically only) method from the set, filtering out HEAD
@ -104,6 +107,7 @@ def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:
            route_impls[method][_convert_path_to_regex(route.path)] = (
                func,
                route.path,
+                webmethod,
            )

    return route_impls
@ -118,7 +122,7 @@ def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> Rout
        route_impls: A dictionary of endpoint implementations

    Returns:
-        A tuple of (endpoint_function, path_params, descriptive_name)
+        A tuple of (endpoint_function, path_params, route_path, webmethod_metadata)

    Raises:
        ValueError: If no matching endpoint is found
@ -127,11 +131,11 @@ def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> Rout
    if not impls:
        raise ValueError(f"No endpoint found for {path}")

-    for regex, (func, descriptive_name) in impls.items():
+    for regex, (func, route_path, webmethod) in impls.items():
        match = re.match(regex, path)
        if match:
            # Extract named groups from the regex match
            path_params = match.groupdict()
-            return func, path_params, descriptive_name
+            return func, path_params, route_path, webmethod

    raise ValueError(f"No endpoint found for {path}")
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -40,7 +40,12 @@ from llama_stack.distribution.datatypes import (
    StackRunConfig,
 )
 from llama_stack.distribution.distribution import builtin_automatically_routed_apis
-from llama_stack.distribution.request_headers import PROVIDER_DATA_VAR, User, request_provider_data_context
+from llama_stack.distribution.external import ExternalApiSpec, load_external_apis
+from llama_stack.distribution.request_headers import (
+    PROVIDER_DATA_VAR,
+    request_provider_data_context,
+    user_from_scope,
+)
 from llama_stack.distribution.resolver import InvalidProviderError
 from llama_stack.distribution.server.routes import (
    find_matching_route,
@ -51,6 +56,7 @@ from llama_stack.distribution.stack import (
    cast_image_name_to_string,
    construct_stack,
    replace_env_vars,
+    shutdown_stack,
    validate_env_pair,
 )
 from llama_stack.distribution.utils.config import redact_sensitive_fields
@ -146,18 +152,7 @@ async def shutdown(app):
    Handled by the lifespan context manager. The shutdown process involves
    shutting down all implementations registered in the application.
    """
-    for impl in app.__llama_stack_impls__.values():
-        impl_name = impl.__class__.__name__
-        logger.info("Shutting down %s", impl_name)
-        try:
-            if hasattr(impl, "shutdown"):
-                await asyncio.wait_for(impl.shutdown(), timeout=5)
-            else:
-                logger.warning("No shutdown method for %s", impl_name)
-        except TimeoutError:
-            logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True)
-        except (Exception, asyncio.CancelledError) as e:
-            logger.exception("Failed to shutdown %s: %s", impl_name, {e})
+    await shutdown_stack(app.__llama_stack_impls__)


@asynccontextmanager
@ -222,9 +217,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
    @functools.wraps(func)
    async def route_handler(request: Request, **kwargs):
        # Get auth attributes from the request scope
-        user_attributes = request.scope.get("user_attributes", {})
-        principal = request.scope.get("principal", "")
-        user = User(principal=principal, attributes=user_attributes)
+        user = user_from_scope(request.scope)

        await log_request_pre_validation(request)

@ -282,9 +275,10 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:


 class TracingMiddleware:
-    def __init__(self, app, impls):
+    def __init__(self, app, impls, external_apis: dict[str, ExternalApiSpec]):
        self.app = app
        self.impls = impls
+        self.external_apis = external_apis
        # FastAPI built-in paths that should bypass custom routing
        self.fastapi_paths = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")

@ -301,10 +295,12 @@ class TracingMiddleware:
            return await self.app(scope, receive, send)

        if not hasattr(self, "route_impls"):
-            self.route_impls = initialize_route_impls(self.impls)
+            self.route_impls = initialize_route_impls(self.impls, self.external_apis)

        try:
-            _, _, trace_path = find_matching_route(scope.get("method", hdrs.METH_GET), path, self.route_impls)
+            _, _, route_path, webmethod = find_matching_route(
+                scope.get("method", hdrs.METH_GET), path, self.route_impls
+            )
        except ValueError:
            # If no matching endpoint is found, pass through to FastAPI
            logger.debug(f"No matching route found for path: {path}, falling back to FastAPI")
@ -321,6 +317,7 @@ class TracingMiddleware:
        if tracestate:
            trace_attributes["tracestate"] = tracestate

+        trace_path = webmethod.descriptive_name or route_path
        trace_context = await start_trace(trace_path, trace_attributes)

        async def send_with_trace_id(message):
@ -432,10 +429,21 @@ def main(args: argparse.Namespace | None = None):
    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
        app.add_middleware(ClientVersionMiddleware)

-    # Add authentication middleware if configured
+    try:
+        # Create and set the event loop that will be used for both construction and server runtime
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Construct the stack in the persistent event loop
+        impls = loop.run_until_complete(construct_stack(config))
+
+    except InvalidProviderError as e:
+        logger.error(f"Error: {str(e)}")
+        sys.exit(1)
+
    if config.server.auth:
        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
-        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
+        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth, impls=impls)
    else:
        if config.server.quota:
            quota = config.server.quota
@ -466,24 +474,14 @@ def main(args: argparse.Namespace | None = None):
            window_seconds=window_seconds,
        )

-    try:
-        # Create and set the event loop that will be used for both construction and server runtime
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-        # Construct the stack in the persistent event loop
-        impls = loop.run_until_complete(construct_stack(config))
-
-    except InvalidProviderError as e:
-        logger.error(f"Error: {str(e)}")
-        sys.exit(1)
-
    if Api.telemetry in impls:
        setup_logger(impls[Api.telemetry])
    else:
        setup_logger(TelemetryAdapter(TelemetryConfig(), {}))

-    all_routes = get_all_api_routes()
+    # Load external APIs if configured
+    external_apis = load_external_apis(config)
+    all_routes = get_all_api_routes(external_apis)

    if config.apis:
        apis_to_serve = set(config.apis)
@ -502,9 +500,12 @@ def main(args: argparse.Namespace | None = None):
        api = Api(api_str)

        routes = all_routes[api]
-        impl = impls[api]
+        try:
+            impl = impls[api]
+        except KeyError as e:
+            raise ValueError(f"Could not find provider implementation for {api} API") from e

-        for route in routes:
+        for route, _ in routes:
            if not hasattr(impl, route.name):
                # ideally this should be a typing violation already
                raise ValueError(f"Could not find method {route.name} on {impl}!")
@ -533,7 +534,7 @@ def main(args: argparse.Namespace | None = None):
    app.exception_handler(Exception)(global_exception_handler)

    app.__llama_stack_impls__ = impls
-    app.add_middleware(TracingMiddleware, impls=impls)
+    app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)

    import uvicorn

@ -610,11 +611,8 @@ def extract_path_params(route: str) -> list[str]:

 def remove_disabled_providers(obj):
    if isinstance(obj, dict):
-        if (
-            obj.get("provider_id") == "__disabled__"
-            or obj.get("shield_id") == "__disabled__"
-            or obj.get("provider_model_id") == "__disabled__"
-        ):
+        keys = ["provider_id", "shield_id", "provider_model_id", "model_id"]
+        if any(k in obj and obj[k] in ("__disabled__", "", None) for k in keys):
            return None
        return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
    elif isinstance(obj, list):
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import importlib.resources
 import os
 import re
@ -38,6 +39,7 @@ from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.inspect import DistributionInspectConfig, DistributionInspectImpl
 from llama_stack.distribution.providers import ProviderImpl, ProviderImplConfig
 from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls
+from llama_stack.distribution.routing_tables.common import CommonRoutingTableImpl
 from llama_stack.distribution.store.registry import create_dist_registry
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
@ -90,6 +92,10 @@ RESOURCES = [
 ]


+REGISTRY_REFRESH_INTERVAL_SECONDS = 300
+REGISTRY_REFRESH_TASK = None
+
+
 async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
    for rsrc, api, register_method, list_method in RESOURCES:
        objects = getattr(run_config, rsrc)
@ -99,23 +105,10 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
        method = getattr(impls[api], register_method)
        for obj in objects:
            logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
-            # Do not register models on disabled providers
-            if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__":
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
-                continue
-            # In complex templates, like our starter template, we may have dynamic model ids
-            # given by environment variables. This allows those environment variables to have
-            # a default value of __disabled__ to skip registration of the model if not set.
-            if (
-                hasattr(obj, "provider_model_id")
-                and obj.provider_model_id is not None
-                and "__disabled__" in obj.provider_model_id
-            ):
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.")
-                continue

-            if hasattr(obj, "shield_id") and obj.shield_id is not None and obj.shield_id == "__disabled__":
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled shield.")
+            # Do not register models on disabled providers
+            if hasattr(obj, "provider_id") and (not obj.provider_id or obj.provider_id == "__disabled__"):
+                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
                continue

            # we want to maintain the type information in arguments to method.
@ -324,9 +317,61 @@ async def construct_stack(
    add_internal_implementations(impls, run_config)

    await register_resources(run_config, impls)
+
+    await refresh_registry_once(impls)
+
+    global REGISTRY_REFRESH_TASK
+    REGISTRY_REFRESH_TASK = asyncio.create_task(refresh_registry_task(impls))
+
+    def cb(task):
+        import traceback
+
+        if task.cancelled():
+            logger.error("Model refresh task cancelled")
+        elif task.exception():
+            logger.error(f"Model refresh task failed: {task.exception()}")
+            traceback.print_exception(task.exception())
+        else:
+            logger.debug("Model refresh task completed")
+
+    REGISTRY_REFRESH_TASK.add_done_callback(cb)
    return impls


+async def shutdown_stack(impls: dict[Api, Any]):
+    for impl in impls.values():
+        impl_name = impl.__class__.__name__
+        logger.info(f"Shutting down {impl_name}")
+        try:
+            if hasattr(impl, "shutdown"):
+                await asyncio.wait_for(impl.shutdown(), timeout=5)
+            else:
+                logger.warning(f"No shutdown method for {impl_name}")
+        except TimeoutError:
+            logger.exception(f"Shutdown timeout for {impl_name}")
+        except (Exception, asyncio.CancelledError) as e:
+            logger.exception(f"Failed to shutdown {impl_name}: {e}")
+
+    global REGISTRY_REFRESH_TASK
+    if REGISTRY_REFRESH_TASK:
+        REGISTRY_REFRESH_TASK.cancel()
+
+
+async def refresh_registry_once(impls: dict[Api, Any]):
+    logger.info("refreshing registry")
+    routing_tables = [v for v in impls.values() if isinstance(v, CommonRoutingTableImpl)]
+    for routing_table in routing_tables:
+        await routing_table.refresh()
+
+
+async def refresh_registry_task(impls: dict[Api, Any]):
+    logger.info("starting registry refresh task")
+    while True:
+        await refresh_registry_once(impls)
+
+        await asyncio.sleep(REGISTRY_REFRESH_INTERVAL_SECONDS)
+
+
 def get_stack_run_config_from_template(template: str) -> StackRunConfig:
    template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"

--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@ -6,6 +6,7 @@

 import logging
 import os
+import re
 import sys
 from logging.config import dictConfig

@ -30,6 +31,7 @@ CATEGORIES = [
    "eval",
    "tools",
    "client",
+    "telemetry",
 ]

 # Initialize category levels with default level
@ -113,6 +115,11 @@ def parse_environment_config(env_config: str) -> dict[str, int]:
    return category_levels


+def strip_rich_markup(text):
+    """Remove Rich markup tags like [dim], [bold magenta], etc."""
+    return re.sub(r"\[/?[a-zA-Z0-9 _#=,]+\]", "", text)
+
+
 class CustomRichHandler(RichHandler):
    def __init__(self, *args, **kwargs):
        kwargs["console"] = Console(width=150)
@ -131,6 +138,19 @@ class CustomRichHandler(RichHandler):
                self.markup = original_markup


+class CustomFileHandler(logging.FileHandler):
+    def __init__(self, filename, mode="a", encoding=None, delay=False):
+        super().__init__(filename, mode, encoding, delay)
+        # Default formatter to match console output
+        self.default_formatter = logging.Formatter("%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s")
+        self.setFormatter(self.default_formatter)
+
+    def emit(self, record):
+        if hasattr(record, "msg"):
+            record.msg = strip_rich_markup(str(record.msg))
+        super().emit(record)
+
+
 def setup_logging(category_levels: dict[str, int], log_file: str | None) -> None:
    """
    Configure logging based on the provided category log levels and an optional log file.
@ -167,8 +187,7 @@ def setup_logging(category_levels: dict[str, int], log_file: str | None) -> None
    # Add a file handler if log_file is set
    if log_file:
        handlers["file"] = {
-            "class": "logging.FileHandler",
-            "formatter": "rich",
+            "()": CustomFileHandler,
            "filename": log_file,
            "mode": "a",
            "encoding": "utf-8",
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@ -43,10 +43,24 @@ class ModelsProtocolPrivate(Protocol):
       -> Provider uses provider-model-id for inference
    """

+    # this should be called `on_model_register` or something like that.
+    # the provider should _not_ be able to change the object in this
+    # callback
    async def register_model(self, model: Model) -> Model: ...

    async def unregister_model(self, model_id: str) -> None: ...

+    # the Stack router will query each provider for their list of models
+    # if a `refresh_interval_seconds` is provided, this method will be called
+    # periodically to refresh the list of models
+    #
+    # NOTE: each model returned will be registered with the model registry. this means
+    # a callback to the `register_model()` method will be made. this is duplicative and
+    # may be removed in the future.
+    async def list_models(self) -> list[Model] | None: ...
+
+    async def should_refresh_models(self) -> bool: ...
+

 class ShieldsProtocolPrivate(Protocol):
    async def register_shield(self, shield: Shield) -> None: ...
@ -104,6 +118,19 @@ class ProviderSpec(BaseModel):
        description="If this provider is deprecated and does NOT work, specify the error message here",
    )

+    module: str | None = Field(
+        default=None,
+        description="""
+ Fully-qualified name of the module to import. The module is expected to have:
+
+  - `get_adapter_impl(config, deps)`: returns the adapter implementation
+
+  Example: `module: ramalama_stack`
+ """,
+    )
+
+    is_external: bool = Field(default=False, description="Notes whether this provider is an external provider.")
+
    # used internally by the resolver; this is a hack for now
    deps__: list[str] = Field(default_factory=list)

@ -124,7 +151,7 @@ class AdapterSpec(BaseModel):
        description="Unique identifier for this adapter",
    )
    module: str = Field(
-        ...,
+        default_factory=str,
        description="""
 Fully-qualified name of the module to import. The module is expected to have:

@ -162,14 +189,7 @@ The container image to use for this implementation. If one is provided, pip_pack
 If a provider depends on other providers, the dependencies MUST NOT specify a container image.
 """,
    )
-    module: str = Field(
-        ...,
-        description="""
-Fully-qualified name of the module to import. The module is expected to have:
-
- - `get_provider_impl(config, deps)`: returns the local implementation
-""",
-    )
+    # module field is inherited from ProviderSpec
    provider_data_validator: str | None = Field(
        default=None,
    )
@ -212,9 +232,7 @@ API responses, specify the adapter here.
    def container_image(self) -> str | None:
        return None

-    @property
-    def module(self) -> str:
-        return self.adapter.module
+    # module field is inherited from ProviderSpec

    @property
    def pip_packages(self) -> list[str]:
@ -226,14 +244,19 @@ API responses, specify the adapter here.


 def remote_provider_spec(
-    api: Api, adapter: AdapterSpec, api_dependencies: list[Api] | None = None
+    api: Api,
+    adapter: AdapterSpec,
+    api_dependencies: list[Api] | None = None,
+    optional_api_dependencies: list[Api] | None = None,
 ) -> RemoteProviderSpec:
    return RemoteProviderSpec(
        api=api,
        provider_type=f"remote::{adapter.adapter_type}",
        config_class=adapter.config_class,
+        module=adapter.module,
        adapter=adapter,
        api_dependencies=api_dependencies or [],
+        optional_api_dependencies=optional_api_dependencies or [],
    )


--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@ -102,6 +102,12 @@ class MetaReferenceInferenceImpl(
        if self.config.create_distributed_process_group:
            self.generator.stop()

+    async def should_refresh_models(self) -> bool:
+        return False
+
+    async def list_models(self) -> list[Model] | None:
+        return None
+
    async def unregister_model(self, model_id: str) -> None:
        pass

--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@ -20,6 +20,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.models import ModelType
 from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.embedding_mixin import (
    SentenceTransformerEmbeddingMixin,
@ -41,6 +42,8 @@ class SentenceTransformersInferenceImpl(
    InferenceProvider,
    ModelsProtocolPrivate,
 ):
+    __provider_id__: str
+
    def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
        self.config = config

@ -50,6 +53,22 @@ class SentenceTransformersInferenceImpl(
    async def shutdown(self) -> None:
        pass

+    async def should_refresh_models(self) -> bool:
+        return False
+
+    async def list_models(self) -> list[Model] | None:
+        return [
+            Model(
+                identifier="all-MiniLM-L6-v2",
+                provider_resource_id="all-MiniLM-L6-v2",
+                provider_id=self.__provider_id__,
+                metadata={
+                    "embedding_dimension": 384,
+                },
+                model_type=ModelType.embedding,
+            ),
+        ]
+
    async def register_model(self, model: Model) -> Model:
        return model

--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@ -146,9 +146,9 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
        pass

    async def register_shield(self, shield: Shield) -> None:
-        # Allow any model to be registered as a shield
-        # The model will be validated during runtime when making inference calls
-        pass
+        model_id = shield.provider_resource_id
+        if not model_id:
+            raise ValueError("Llama Guard shield must have a model id")

    async def run_shield(
        self,
--- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
@ -11,19 +11,9 @@ from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanProcessor
 from opentelemetry.trace.status import StatusCode

-# Colors for console output
-COLORS = {
-    "reset": "\033[0m",
-    "bold": "\033[1m",
-    "dim": "\033[2m",
-    "red": "\033[31m",
-    "green": "\033[32m",
-    "yellow": "\033[33m",
-    "blue": "\033[34m",
-    "magenta": "\033[35m",
-    "cyan": "\033[36m",
-    "white": "\033[37m",
-}
+from llama_stack.log import get_logger
+
+logger = get_logger(name="console_span_processor", category="telemetry")


 class ConsoleSpanProcessor(SpanProcessor):
@ -35,34 +25,21 @@ class ConsoleSpanProcessor(SpanProcessor):
            return

        timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
-
-        print(
-            f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
-            f"{COLORS['magenta']}[START]{COLORS['reset']} "
-            f"{COLORS['dim']}{span.name}{COLORS['reset']}"
-        )
+        logger.info(f"[dim]{timestamp}[/dim] [bold magenta][START][/bold magenta] [dim]{span.name}[/dim]")

    def on_end(self, span: ReadableSpan) -> None:
        if span.attributes and span.attributes.get("__autotraced__"):
            return

        timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
-
-        span_context = (
-            f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
-            f"{COLORS['magenta']}[END]{COLORS['reset']} "
-            f"{COLORS['dim']}{span.name}{COLORS['reset']}"
-        )
-
+        span_context = f"[dim]{timestamp}[/dim] [bold magenta][END][/bold magenta] [dim]{span.name}[/dim]"
        if span.status.status_code == StatusCode.ERROR:
-            span_context += f"{COLORS['reset']} {COLORS['red']}[ERROR]{COLORS['reset']}"
+            span_context += " [bold red][ERROR][/bold red]"
        elif span.status.status_code != StatusCode.UNSET:
-            span_context += f"{COLORS['reset']} [{span.status.status_code}]"
-
+            span_context += f" [{span.status.status_code}]"
        duration_ms = (span.end_time - span.start_time) / 1e6
-        span_context += f"{COLORS['reset']} ({duration_ms:.2f}ms)"
-
-        print(span_context)
+        span_context += f" ({duration_ms:.2f}ms)"
+        logger.info(span_context)

        if self.print_attributes and span.attributes:
            for key, value in span.attributes.items():
@ -71,31 +48,26 @@ class ConsoleSpanProcessor(SpanProcessor):
                str_value = str(value)
                if len(str_value) > 1000:
                    str_value = str_value[:997] + "..."
-                print(f"    {COLORS['dim']}{key}: {str_value}{COLORS['reset']}")
+                logger.info(f"    [dim]{key}[/dim]: {str_value}")

        for event in span.events:
            event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3]
-
            severity = event.attributes.get("severity", "info")
            message = event.attributes.get("message", event.name)
-            if isinstance(message, dict | list):
+            if isinstance(message, dict) or isinstance(message, list):
                message = json.dumps(message, indent=2)
-
-            severity_colors = {
-                "error": f"{COLORS['bold']}{COLORS['red']}",
-                "warn": f"{COLORS['bold']}{COLORS['yellow']}",
-                "info": COLORS["white"],
-                "debug": COLORS["dim"],
-            }
-            msg_color = severity_colors.get(severity, COLORS["white"])
-
-            print(f" {event_time} {msg_color}[{severity.upper()}] {message}{COLORS['reset']}")
-
+            severity_color = {
+                "error": "red",
+                "warn": "yellow",
+                "info": "white",
+                "debug": "dim",
+            }.get(severity, "white")
+            logger.info(f" {event_time} [bold {severity_color}][{severity.upper()}][/bold {severity_color}] {message}")
            if event.attributes:
                for key, value in event.attributes.items():
                    if key.startswith("__") or key in ["message", "severity"]:
                        continue
-                    print(f"   {COLORS['dim']}{key}: {value}{COLORS['reset']}")
+                    logger.info(f"/r[dim]{key}[/dim]: {value}")

    def shutdown(self) -> None:
        """Shutdown the processor."""
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@ -55,6 +55,11 @@ class FaissIndex(EmbeddingIndex):
        self.kvstore = kvstore
        self.bank_id = bank_id

+        # A list of chunk id's in the same order as they are in the index,
+        # must be updated when chunks are added or removed
+        self.chunk_id_lock = asyncio.Lock()
+        self.chunk_ids: list[Any] = []
+
    @classmethod
    async def create(cls, dimension: int, kvstore: KVStore | None = None, bank_id: str | None = None):
        instance = cls(dimension, kvstore, bank_id)
@ -75,6 +80,7 @@ class FaissIndex(EmbeddingIndex):
            buffer = io.BytesIO(base64.b64decode(data["faiss_index"]))
            try:
                self.index = faiss.deserialize_index(np.load(buffer, allow_pickle=False))
+                self.chunk_ids = [chunk.chunk_id for chunk in self.chunk_by_index.values()]
            except Exception as e:
                logger.debug(e, exc_info=True)
                raise ValueError(
@ -114,11 +120,33 @@ class FaissIndex(EmbeddingIndex):
        for i, chunk in enumerate(chunks):
            self.chunk_by_index[indexlen + i] = chunk

-        self.index.add(np.array(embeddings).astype(np.float32))
+        async with self.chunk_id_lock:
+            self.index.add(np.array(embeddings).astype(np.float32))
+            self.chunk_ids.extend([chunk.chunk_id for chunk in chunks])

        # Save updated index
        await self._save_index()

+    async def delete_chunk(self, chunk_id: str) -> None:
+        if chunk_id not in self.chunk_ids:
+            return
+
+        async with self.chunk_id_lock:
+            index = self.chunk_ids.index(chunk_id)
+            self.index.remove_ids(np.array([index]))
+
+            new_chunk_by_index = {}
+            for idx, chunk in self.chunk_by_index.items():
+                # Shift all chunks after the removed chunk to the left
+                if idx > index:
+                    new_chunk_by_index[idx - 1] = chunk
+                else:
+                    new_chunk_by_index[idx] = chunk
+            self.chunk_by_index = new_chunk_by_index
+            self.chunk_ids.pop(index)
+
+        await self._save_index()
+
    async def query_vector(
        self,
        embedding: NDArray,
@ -260,3 +288,9 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
            raise ValueError(f"Vector DB {vector_db_id} not found")

        return await index.query_chunks(query, params)
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        """Delete a chunk from a faiss index"""
+        faiss_index = self.cache[store_id].index
+        for chunk_id in chunk_ids:
+            await faiss_index.delete_chunk(chunk_id)
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -425,6 +425,35 @@ class SQLiteVecIndex(EmbeddingIndex):

        return QueryChunksResponse(chunks=chunks, scores=scores)

+    async def delete_chunk(self, chunk_id: str) -> None:
+        """Remove a chunk from the SQLite vector store."""
+
+        def _delete_chunk():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute("BEGIN TRANSACTION")
+
+                # Delete from metadata table
+                cur.execute(f"DELETE FROM {self.metadata_table} WHERE id = ?", (chunk_id,))
+
+                # Delete from vector table
+                cur.execute(f"DELETE FROM {self.vector_table} WHERE id = ?", (chunk_id,))
+
+                # Delete from FTS table
+                cur.execute(f"DELETE FROM {self.fts_table} WHERE id = ?", (chunk_id,))
+
+                connection.commit()
+            except Exception as e:
+                connection.rollback()
+                logger.error(f"Error deleting chunk {chunk_id}: {e}")
+                raise
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_delete_chunk)
+

 class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
    """
@ -520,3 +549,13 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
        if not index:
            raise ValueError(f"Vector DB {vector_db_id} not found")
        return await index.query_chunks(query, params)
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        """Delete a chunk from a sqlite_vec index."""
+        index = await self._get_and_cache_vector_db_index(store_id)
+        if not index:
+            raise ValueError(f"Vector DB {store_id} not found")
+
+        for chunk_id in chunk_ids:
+            # Use the index's delete_chunk method
+            await index.index.delete_chunk(chunk_id)
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@ -410,6 +410,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
 """,
            ),
            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
        ),
        remote_provider_spec(
            Api.vector_io,
--- a/llama_stack/providers/remote/inference/anthropic/anthropic.py
+++ b/llama_stack/providers/remote/inference/anthropic/anthropic.py
@ -15,6 +15,7 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="anthropic",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="anthropic_api_key",
        )
--- a/llama_stack/providers/remote/inference/anthropic/config.py
+++ b/llama_stack/providers/remote/inference/anthropic/config.py
@ -26,7 +26,7 @@ class AnthropicConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/anthropic/models.py
+++ b/llama_stack/providers/remote/inference/anthropic/models.py
@ -10,9 +10,9 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "anthropic/claude-3-5-sonnet-latest",
-    "anthropic/claude-3-7-sonnet-latest",
-    "anthropic/claude-3-5-haiku-latest",
+    "claude-3-5-sonnet-latest",
+    "claude-3-7-sonnet-latest",
+    "claude-3-5-haiku-latest",
 ]

 SAFETY_MODELS_ENTRIES = []
@ -21,17 +21,17 @@ MODEL_ENTRIES = (
    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
    + [
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3",
+            provider_model_id="voyage-3",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 1024, "context_length": 32000},
        ),
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3-lite",
+            provider_model_id="voyage-3-lite",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 512, "context_length": 32000},
        ),
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-code-3",
+            provider_model_id="voyage-code-3",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 1024, "context_length": 32000},
        ),
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -63,18 +63,20 @@ class BedrockInferenceAdapter(
    def __init__(self, config: BedrockConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self._config = config
-
-        self._client = create_bedrock_client(config)
+        self._client = None

    @property
    def client(self) -> BaseClient:
+        if self._client is None:
+            self._client = create_bedrock_client(self._config)
        return self._client

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
-        self.client.close()
+        if self._client is not None:
+            self._client.close()

    async def completion(
        self,
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@ -65,6 +65,7 @@ class CerebrasInferenceAdapter(
        )
        self.config = config

+        # TODO: make this use provider data, etc. like other providers
        self.client = AsyncCerebras(
            base_url=self.config.base_url,
            api_key=self.config.api_key.get_secret_value(),
--- a/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/llama_stack/providers/remote/inference/cerebras/config.py
@ -26,7 +26,7 @@ class CerebrasImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "base_url": DEFAULT_BASE_URL,
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/databricks/config.py
+++ b/llama_stack/providers/remote/inference/databricks/config.py
@ -25,8 +25,8 @@ class DatabricksImplConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.DATABRICKS_URL}",
-        api_token: str = "${env.DATABRICKS_API_TOKEN}",
+        url: str = "${env.DATABRICKS_URL:=}",
+        api_token: str = "${env.DATABRICKS_API_TOKEN:=}",
        **kwargs: Any,
    ) -> dict[str, Any]:
        return {
--- a/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/llama_stack/providers/remote/inference/fireworks/config.py
@ -6,13 +6,14 @@

 from typing import Any

-from pydantic import BaseModel, Field, SecretStr
+from pydantic import Field, SecretStr

+from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack.schema_utils import json_schema_type


@json_schema_type
-class FireworksImplConfig(BaseModel):
+class FireworksImplConfig(RemoteInferenceProviderConfig):
    url: str = Field(
        default="https://api.fireworks.ai/inference/v1",
        description="The URL for the Fireworks server",
@ -23,7 +24,7 @@ class FireworksImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.fireworks.ai/inference/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -70,7 +70,7 @@ logger = get_logger(name=__name__, category="inference")

 class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData):
    def __init__(self, config: FireworksImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
+        ModelRegistryHelper.__init__(self, MODEL_ENTRIES, config.allowed_models)
        self.config = config

    async def initialize(self) -> None:
--- a/llama_stack/providers/remote/inference/gemini/config.py
+++ b/llama_stack/providers/remote/inference/gemini/config.py
@ -26,7 +26,7 @@ class GeminiConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/gemini/gemini.py
+++ b/llama_stack/providers/remote/inference/gemini/gemini.py
@ -15,6 +15,7 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="gemini",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="gemini_api_key",
        )
--- a/llama_stack/providers/remote/inference/gemini/models.py
+++ b/llama_stack/providers/remote/inference/gemini/models.py
@ -10,11 +10,11 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "gemini/gemini-1.5-flash",
-    "gemini/gemini-1.5-pro",
-    "gemini/gemini-2.0-flash",
-    "gemini/gemini-2.5-flash",
-    "gemini/gemini-2.5-pro",
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash",
+    "gemini-2.5-pro",
 ]

 SAFETY_MODELS_ENTRIES = []
@ -23,7 +23,7 @@ MODEL_ENTRIES = (
    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
    + [
        ProviderModelEntry(
-            provider_model_id="gemini/text-embedding-004",
+            provider_model_id="text-embedding-004",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 768, "context_length": 2048},
        ),
--- a/llama_stack/providers/remote/inference/groq/config.py
+++ b/llama_stack/providers/remote/inference/groq/config.py
@ -32,7 +32,7 @@ class GroqConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.groq.com",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -34,6 +34,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="groq",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="groq_api_key",
        )
@ -96,7 +97,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
            tool_choice = "required"

        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id.replace("groq/", ""),
+            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
--- a/llama_stack/providers/remote/inference/groq/models.py
+++ b/llama_stack/providers/remote/inference/groq/models.py
@ -14,19 +14,19 @@ SAFETY_MODELS_ENTRIES = []

 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
-        "groq/llama3-8b-8192",
+        "llama3-8b-8192",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_model_entry(
-        "groq/llama-3.1-8b-instant",
+        "llama-3.1-8b-instant",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama3-70b-8192",
+        "llama3-70b-8192",
        CoreModelId.llama3_70b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama-3.3-70b-versatile",
+        "llama-3.3-70b-versatile",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
    # Groq only contains a preview version for llama-3.2-3b
@ -34,23 +34,15 @@ MODEL_ENTRIES = [
    # to pass the test fixture
    # TODO(aidand): Replace this with a stable model once Groq supports it
    build_hf_repo_model_entry(
-        "groq/llama-3.2-3b-preview",
+        "llama-3.2-3b-preview",
        CoreModelId.llama3_2_3b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama-4-scout-17b-16e-instruct",
+        "meta-llama/llama-4-scout-17b-16e-instruct",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-scout-17b-16e-instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "groq/llama-4-maverick-17b-128e-instruct",
-        CoreModelId.llama4_maverick_17b_128e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+        "meta-llama/llama-4-maverick-17b-128e-instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@ -32,6 +32,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="llama",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="llama_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -13,8 +13,10 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"

 class OllamaImplConfig(BaseModel):
    url: str = DEFAULT_OLLAMA_URL
-    refresh_models: bool = Field(default=False, description="refresh and re-register models periodically")
-    refresh_models_interval: int = Field(default=300, description="interval in seconds to refresh models")
+    refresh_models: bool = Field(
+        default=False,
+        description="Whether to refresh models periodically",
+    )

    @classmethod
    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -98,14 +98,16 @@ class OllamaInferenceAdapter(
    def __init__(self, config: OllamaImplConfig) -> None:
        self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
        self.config = config
-        self._client = None
+        self._clients: dict[asyncio.AbstractEventLoop, AsyncClient] = {}
        self._openai_client = None

    @property
    def client(self) -> AsyncClient:
-        if self._client is None:
-            self._client = AsyncClient(host=self.config.url)
-        return self._client
+        # ollama client attaches itself to the current event loop (sadly?)
+        loop = asyncio.get_running_loop()
+        if loop not in self._clients:
+            self._clients[loop] = AsyncClient(host=self.config.url)
+        return self._clients[loop]

    @property
    def openai_client(self) -> AsyncOpenAI:
@ -121,59 +123,61 @@ class OllamaInferenceAdapter(
                "Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
            )

-        if self.config.refresh_models:
-            logger.debug("ollama starting background model refresh task")
-            self._refresh_task = asyncio.create_task(self._refresh_models())
-
-            def cb(task):
-                if task.cancelled():
-                    import traceback
-
-                    logger.error(f"ollama background refresh task canceled:\n{''.join(traceback.format_stack())}")
-                elif task.exception():
-                    logger.error(f"ollama background refresh task died: {task.exception()}")
-                else:
-                    logger.error("ollama background refresh task completed unexpectedly")
-
-            self._refresh_task.add_done_callback(cb)
-
-    async def _refresh_models(self) -> None:
-        # Wait for model store to be available (with timeout)
-        waited_time = 0
-        while not self.model_store and waited_time < 60:
-            await asyncio.sleep(1)
-            waited_time += 1
-
-        if not self.model_store:
-            raise ValueError("Model store not set after waiting 60 seconds")
+    async def should_refresh_models(self) -> bool:
+        return self.config.refresh_models

+    async def list_models(self) -> list[Model] | None:
        provider_id = self.__provider_id__
-        while True:
-            try:
-                response = await self.client.list()
-            except Exception as e:
-                logger.warning(f"Failed to list models: {str(e)}")
-                await asyncio.sleep(self.config.refresh_models_interval)
+        response = await self.client.list()
+
+        # always add the two embedding models which can be pulled on demand
+        models = [
+            Model(
+                identifier="all-minilm:l6-v2",
+                provider_resource_id="all-minilm:l6-v2",
+                provider_id=provider_id,
+                metadata={
+                    "embedding_dimension": 384,
+                    "context_length": 512,
+                },
+                model_type=ModelType.embedding,
+            ),
+            # add all-minilm alias
+            Model(
+                identifier="all-minilm",
+                provider_resource_id="all-minilm:l6-v2",
+                provider_id=provider_id,
+                metadata={
+                    "embedding_dimension": 384,
+                    "context_length": 512,
+                },
+                model_type=ModelType.embedding,
+            ),
+            Model(
+                identifier="nomic-embed-text",
+                provider_resource_id="nomic-embed-text",
+                provider_id=provider_id,
+                metadata={
+                    "embedding_dimension": 768,
+                    "context_length": 8192,
+                },
+                model_type=ModelType.embedding,
+            ),
+        ]
+        for m in response.models:
+            # kill embedding models since we don't know dimensions for them
+            if "bert" in m.details.family:
                continue
-
-            models = []
-            for m in response.models:
-                model_type = ModelType.embedding if m.details.family in ["bert"] else ModelType.llm
-                if model_type == ModelType.embedding:
-                    continue
-                models.append(
-                    Model(
-                        identifier=m.model,
-                        provider_resource_id=m.model,
-                        provider_id=provider_id,
-                        metadata={},
-                        model_type=model_type,
-                    )
+            models.append(
+                Model(
+                    identifier=m.model,
+                    provider_resource_id=m.model,
+                    provider_id=provider_id,
+                    metadata={},
+                    model_type=ModelType.llm,
                )
-            await self.model_store.update_registered_llm_models(provider_id, models)
-            logger.debug(f"ollama refreshed model list ({len(models)} models)")
-
-            await asyncio.sleep(self.config.refresh_models_interval)
+            )
+        return models

    async def health(self) -> HealthResponse:
        """
@ -190,12 +194,7 @@ class OllamaInferenceAdapter(
            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")

    async def shutdown(self) -> None:
-        if hasattr(self, "_refresh_task") and not self._refresh_task.done():
-            logger.debug("ollama cancelling background refresh task")
-            self._refresh_task.cancel()
-
-        self._client = None
-        self._openai_client = None
+        self._clients.clear()

    async def unregister_model(self, model_id: str) -> None:
        pass
@ -421,9 +420,6 @@ class OllamaInferenceAdapter(
        except ValueError:
            pass  # Ignore statically unknown model, will check live listing

-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
-
        if model.model_type == ModelType.embedding:
            response = await self.client.list()
            if model.provider_resource_id not in [m.model for m in response.models]:
@ -434,9 +430,9 @@ class OllamaInferenceAdapter(
        #  - models not currently running are run by the ollama server as needed
        response = await self.client.list()
        available_models = [m.model for m in response.models]
-        provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
-        if provider_resource_id is None:
-            provider_resource_id = model.provider_resource_id
+
+        provider_resource_id = model.provider_resource_id
+        assert provider_resource_id is not None  # mypy
        if provider_resource_id not in available_models:
            available_models_latest = [m.model.split(":latest")[0] for m in response.models]
            if provider_resource_id in available_models_latest:
@ -444,7 +440,9 @@ class OllamaInferenceAdapter(
                    f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
                )
                return model
-            raise UnsupportedModelError(model.provider_resource_id, available_models)
+            raise UnsupportedModelError(provider_resource_id, available_models)
+
+        # mutating this should be considered an anti-pattern
        model.provider_resource_id = provider_resource_id

        return model
--- a/llama_stack/providers/remote/inference/openai/config.py
+++ b/llama_stack/providers/remote/inference/openai/config.py
@ -26,7 +26,7 @@ class OpenAIConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/openai/models.py
+++ b/llama_stack/providers/remote/inference/openai/models.py
@ -12,11 +12,6 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    # the models w/ "openai/" prefix are the litellm specific model names.
-    # they should be deprecated in favor of the canonical openai model names.
-    "openai/gpt-4o",
-    "openai/gpt-4o-mini",
-    "openai/chatgpt-4o-latest",
    "gpt-3.5-turbo-0125",
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-instruct",
@ -43,8 +38,6 @@ class EmbeddingModelInfo:


 EMBEDDING_MODEL_IDS: dict[str, EmbeddingModelInfo] = {
-    "openai/text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
-    "openai/text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
    "text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
    "text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
 }
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@ -45,6 +45,7 @@ class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="openai",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="openai_api_key",
        )
--- a/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/llama_stack/providers/remote/inference/sambanova/config.py
@ -30,7 +30,7 @@ class SambaNovaImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.sambanova.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@ -9,49 +9,20 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

-SAFETY_MODELS_ENTRIES = [
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-]
+SAFETY_MODELS_ENTRIES = []


 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-8B-Instruct",
+        "Meta-Llama-3.1-8B-Instruct",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-405B-Instruct",
-        CoreModelId.llama3_1_405b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-1B-Instruct",
-        CoreModelId.llama3_2_1b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-3B-Instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.3-70B-Instruct",
+        "Meta-Llama-3.3-70B-Instruct",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-11B-Vision-Instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-90B-Vision-Instruct",
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-4-Scout-17B-16E-Instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
+        "Llama-4-Maverick-17B-128E-Instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -182,6 +182,7 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="sambanova",
            api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None,
            provider_data_api_key_field="sambanova_api_key",
        )
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@ -19,7 +19,7 @@ class TGIImplConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.TGI_URL}",
+        url: str = "${env.TGI_URL:=}",
        **kwargs,
    ):
        return {
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -305,6 +305,8 @@ class _HfAdapter(

 class TGIAdapter(_HfAdapter):
    async def initialize(self, config: TGIImplConfig) -> None:
+        if not config.url:
+            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
        log.info(f"Initializing TGI client with url={config.url}")
        self.client = AsyncInferenceClient(
            model=config.url,
--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@ -6,13 +6,14 @@

 from typing import Any

-from pydantic import BaseModel, Field, SecretStr
+from pydantic import Field, SecretStr

+from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack.schema_utils import json_schema_type


@json_schema_type
-class TogetherImplConfig(BaseModel):
+class TogetherImplConfig(RemoteInferenceProviderConfig):
    url: str = Field(
        default="https://api.together.xyz/v1",
        description="The URL for the Together AI server",
@ -26,5 +27,5 @@ class TogetherImplConfig(BaseModel):
    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.together.xyz/v1",
-            "api_key": "${env.TOGETHER_API_KEY}",
+            "api_key": "${env.TOGETHER_API_KEY:=}",
        }
--- a/llama_stack/providers/remote/inference/together/models.py
+++ b/llama_stack/providers/remote/inference/together/models.py
@ -69,15 +69,9 @@ MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        ],
    ),
    build_hf_repo_model_entry(
        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-        ],
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -66,7 +66,7 @@ logger = get_logger(name=__name__, category="inference")

 class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData):
    def __init__(self, config: TogetherImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
+        ModelRegistryHelper.__init__(self, MODEL_ENTRIES, config.allowed_models)
        self.config = config

    async def initialize(self) -> None:
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -33,10 +33,6 @@ class VLLMInferenceAdapterConfig(BaseModel):
        default=False,
        description="Whether to refresh models periodically",
    )
-    refresh_models_interval: int = Field(
-        default=300,
-        description="Interval in seconds to refresh models",
-    )

    @field_validator("tls_verify")
    @classmethod
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -3,7 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import asyncio
 import json
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
@ -293,7 +292,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    # automatically set by the resolver when instantiating the provider
    __provider_id__: str
    model_store: ModelStore | None = None
-    _refresh_task: asyncio.Task | None = None

    def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
@ -302,64 +300,32 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):

    async def initialize(self) -> None:
        if not self.config.url:
-            # intentionally don't raise an error here, we want to allow the provider to be "dormant"
-            # or available in distributions like "starter" without causing a ruckus
-            return
+            raise ValueError(
+                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
+            )

-        if self.config.refresh_models:
-            self._refresh_task = asyncio.create_task(self._refresh_models())
-
-            def cb(task):
-                import traceback
-
-                if task.cancelled():
-                    log.error(f"vLLM background refresh task canceled:\n{''.join(traceback.format_stack())}")
-                elif task.exception():
-                    # print the stack trace for the exception
-                    exc = task.exception()
-                    log.error(f"vLLM background refresh task died: {exc}")
-                    traceback.print_exception(exc)
-                else:
-                    log.error("vLLM background refresh task completed unexpectedly")
-
-            self._refresh_task.add_done_callback(cb)
-
-    async def _refresh_models(self) -> None:
-        provider_id = self.__provider_id__
-        waited_time = 0
-        while not self.model_store and waited_time < 60:
-            await asyncio.sleep(1)
-            waited_time += 1
-
-        if not self.model_store:
-            raise ValueError("Model store not set after waiting 60 seconds")
+    async def should_refresh_models(self) -> bool:
+        return self.config.refresh_models

+    async def list_models(self) -> list[Model] | None:
        self._lazy_initialize_client()
        assert self.client is not None  # mypy
-        while True:
-            try:
-                models = []
-                async for m in self.client.models.list():
-                    model_type = ModelType.llm  # unclear how to determine embedding vs. llm models
-                    models.append(
-                        Model(
-                            identifier=m.id,
-                            provider_resource_id=m.id,
-                            provider_id=provider_id,
-                            metadata={},
-                            model_type=model_type,
-                        )
-                    )
-                await self.model_store.update_registered_llm_models(provider_id, models)
-                log.debug(f"vLLM refreshed model list ({len(models)} models)")
-            except Exception as e:
-                log.error(f"vLLM background refresh task failed: {e}")
-            await asyncio.sleep(self.config.refresh_models_interval)
+        models = []
+        async for m in self.client.models.list():
+            model_type = ModelType.llm  # unclear how to determine embedding vs. llm models
+            models.append(
+                Model(
+                    identifier=m.id,
+                    provider_resource_id=m.id,
+                    provider_id=self.__provider_id__,
+                    metadata={},
+                    model_type=model_type,
+                )
+            )
+        return models

    async def shutdown(self) -> None:
-        if self._refresh_task:
-            self._refresh_task.cancel()
-            self._refresh_task = None
+        pass

    async def unregister_model(self, model_id: str) -> None:
        pass
@ -374,9 +340,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            HealthResponse: A dictionary containing the health status.
        """
        try:
-            if not self.config.url:
-                return HealthResponse(status=HealthStatus.ERROR, message="vLLM URL is not set")
-
            client = self._create_client() if self.client is None else self.client
            _ = [m async for m in client.models.list()]  # Ensure the client is initialized
            return HealthResponse(status=HealthStatus.OK)
@ -392,11 +355,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        if self.client is not None:
            return

-        if not self.config.url:
-            raise ValueError(
-                "You must provide a vLLM URL in the run.yaml file (or set the VLLM_URL environment variable)"
-            )
-
        log.info(f"Initializing vLLM client with base_url={self.config.url}")
        self.client = self._create_client()

--- a/llama_stack/providers/remote/safety/sambanova/config.py
+++ b/llama_stack/providers/remote/safety/sambanova/config.py
@ -30,7 +30,7 @@ class SambaNovaSafetyConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.sambanova.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@ -57,12 +57,15 @@ class ChromaIndex(EmbeddingIndex):
        self.collection = collection
        self.kvstore = kvstore

+    async def initialize(self):
+        pass
+
    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )

-        ids = [f"{c.metadata['document_id']}:chunk-{i}" for i, c in enumerate(chunks)]
+        ids = [f"{c.metadata.get('document_id', '')}:{c.chunk_id}" for c in chunks]
        await maybe_await(
            self.collection.add(
                documents=[chunk.model_dump_json() for chunk in chunks],
@ -112,6 +115,9 @@ class ChromaIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in Chroma")

+    async def delete_chunk(self, chunk_id: str) -> None:
+        raise NotImplementedError("delete_chunk is not supported in Chroma")
+
    async def query_hybrid(
        self,
        embedding: NDArray,
@ -137,9 +143,12 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
        self.client = None
        self.cache = {}
        self.kvstore: KVStore | None = None
+        self.vector_db_store = None

    async def initialize(self) -> None:
        self.kvstore = await kvstore_impl(self.config.kvstore)
+        self.vector_db_store = self.kvstore
+
        if isinstance(self.config, RemoteChromaVectorIOConfig):
            log.info(f"Connecting to Chroma server at: {self.config.url}")
            url = self.config.url.rstrip("/")
@ -172,6 +181,10 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
        )

    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        if vector_db_id not in self.cache:
+            log.warning(f"Vector DB {vector_db_id} not found")
+            return
+
        await self.cache[vector_db_id].index.delete()
        del self.cache[vector_db_id]

@ -182,6 +195,8 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
+        if index is None:
+            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")

        await index.insert_chunks(chunks)

@ -193,6 +208,9 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)

+        if index is None:
+            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
+
        return await index.query_chunks(query, params)

    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex:
@ -208,3 +226,6 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
        index = VectorDBWithIndex(vector_db, ChromaIndex(self.client, collection), self.inference_api)
        self.cache[vector_db_id] = index
        return index
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
--- a/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py
@ -247,6 +247,16 @@ class MilvusIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Hybrid search is not supported in Milvus")

+    async def delete_chunk(self, chunk_id: str) -> None:
+        """Remove a chunk from the Milvus collection."""
+        try:
+            await asyncio.to_thread(
+                self.client.delete, collection_name=self.collection_name, filter=f'chunk_id == "{chunk_id}"'
+            )
+        except Exception as e:
+            logger.error(f"Error deleting chunk {chunk_id} from Milvus collection {self.collection_name}: {e}")
+            raise
+

 class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
    def __init__(
@ -369,3 +379,13 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP
                )

        return await index.query_chunks(query, params)
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        """Delete a chunk from a milvus vector store."""
+        index = await self._get_and_cache_vector_db_index(store_id)
+        if not index:
+            raise ValueError(f"Vector DB {store_id} not found")
+
+        for chunk_id in chunk_ids:
+            # Use the index's delete_chunk method
+            await index.index.delete_chunk(chunk_id)
--- a/llama_stack/providers/remote/vector_io/pgvector/init.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/init.py
@ -12,6 +12,6 @@ from .config import PGVectorVectorIOConfig
 async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .pgvector import PGVectorVectorIOAdapter

-    impl = PGVectorVectorIOAdapter(config, deps[Api.inference])
+    impl = PGVectorVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
    await impl.initialize()
    return impl
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@ -99,7 +99,7 @@ class PGVectorIndex(EmbeddingIndex):
        for i, chunk in enumerate(chunks):
            values.append(
                (
-                    f"{chunk.metadata['document_id']}:chunk-{i}",
+                    f"{chunk.chunk_id}",
                    Json(chunk.model_dump()),
                    embeddings[i].tolist(),
                )
@ -159,6 +159,11 @@ class PGVectorIndex(EmbeddingIndex):
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")

+    async def delete_chunk(self, chunk_id: str) -> None:
+        """Remove a chunk from the PostgreSQL table."""
+        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            cur.execute(f"DELETE FROM {self.table_name} WHERE id = %s", (chunk_id,))
+

 class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
    def __init__(
@ -265,3 +270,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
        index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn)
        self.cache[vector_db_id] = VectorDBWithIndex(vector_db, index, self.inference_api)
        return self.cache[vector_db_id]
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        """Delete a chunk from a PostgreSQL vector store."""
+        index = await self._get_and_cache_vector_db_index(store_id)
+        if not index:
+            raise ValueError(f"Vector DB {store_id} not found")
+
+        for chunk_id in chunk_ids:
+            # Use the index's delete_chunk method
+            await index.index.delete_chunk(chunk_id)
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@ -82,6 +82,9 @@ class QdrantIndex(EmbeddingIndex):

        await self.client.upsert(collection_name=self.collection_name, points=points)

+    async def delete_chunk(self, chunk_id: str) -> None:
+        raise NotImplementedError("delete_chunk is not supported in qdrant")
+
    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
        results = (
            await self.client.query_points(
@ -307,3 +310,6 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        file_id: str,
    ) -> VectorStoreFileObject:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@ -66,6 +66,9 @@ class WeaviateIndex(EmbeddingIndex):
        # TODO: make this async friendly
        collection.data.insert_many(data_objects)

+    async def delete_chunk(self, chunk_id: str) -> None:
+        raise NotImplementedError("delete_chunk is not supported in Chroma")
+
    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
        collection = self.client.collections.get(self.collection_name)

@ -264,3 +267,6 @@ class WeaviateVectorIOAdapter(

    async def _delete_openai_vector_store_file_from_storage(self, store_id: str, file_id: str) -> None:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Weaviate")
+
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Weaviate")
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@ -68,11 +68,14 @@ class LiteLLMOpenAIMixin(
    def __init__(
        self,
        model_entries,
+        litellm_provider_name: str,
        api_key_from_config: str | None,
        provider_data_api_key_field: str,
        openai_compat_api_base: str | None = None,
    ):
        ModelRegistryHelper.__init__(self, model_entries)
+
+        self.litellm_provider_name = litellm_provider_name
        self.api_key_from_config = api_key_from_config
        self.provider_data_api_key_field = provider_data_api_key_field
        self.api_base = openai_compat_api_base
@ -91,7 +94,11 @@ class LiteLLMOpenAIMixin(
    def get_litellm_model_name(self, model_id: str) -> str:
        # users may be using openai/ prefix in their model names. the openai/models.py did this by default.
        # model_id.startswith("openai/") is for backwards compatibility.
-        return "openai/" + model_id if self.is_openai_compat and not model_id.startswith("openai/") else model_id
+        return (
+            f"{self.litellm_provider_name}/{model_id}"
+            if self.is_openai_compat and not model_id.startswith(self.litellm_provider_name)
+            else model_id
+        )

    async def completion(
        self,
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@ -20,6 +20,13 @@ from llama_stack.providers.utils.inference import (
 logger = get_logger(name=__name__, category="core")


+class RemoteInferenceProviderConfig(BaseModel):
+    allowed_models: list[str] | None = Field(
+        default=None,
+        description="List of models that should be registered with the model registry. If None, all models are allowed.",
+    )
+
+
 # TODO: this class is more confusing than useful right now. We need to make it
 # more closer to the Model class.
 class ProviderModelEntry(BaseModel):
@ -43,7 +50,8 @@ def build_hf_repo_model_entry(
    additional_aliases: list[str] | None = None,
 ) -> ProviderModelEntry:
    aliases = [
-        get_huggingface_repo(model_descriptor),
+        # NOTE: avoid HF aliases because they _cannot_ be unique across providers
+        # get_huggingface_repo(model_descriptor),
    ]
    if additional_aliases:
        aliases.extend(additional_aliases)
@ -65,7 +73,12 @@ def build_model_entry(provider_model_id: str, model_descriptor: str) -> Provider


 class ModelRegistryHelper(ModelsProtocolPrivate):
-    def __init__(self, model_entries: list[ProviderModelEntry]):
+    __provider_id__: str
+
+    def __init__(self, model_entries: list[ProviderModelEntry], allowed_models: list[str] | None = None):
+        self.model_entries = model_entries
+        self.allowed_models = allowed_models
+
        self.alias_to_provider_id_map = {}
        self.provider_id_to_llama_model_map = {}
        for entry in model_entries:
@ -79,6 +92,27 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
                self.alias_to_provider_id_map[entry.llama_model] = entry.provider_model_id
                self.provider_id_to_llama_model_map[entry.provider_model_id] = entry.llama_model

+    async def list_models(self) -> list[Model] | None:
+        models = []
+        for entry in self.model_entries:
+            ids = [entry.provider_model_id] + entry.aliases
+            for id in ids:
+                if self.allowed_models and id not in self.allowed_models:
+                    continue
+                models.append(
+                    Model(
+                        identifier=id,
+                        provider_resource_id=entry.provider_model_id,
+                        model_type=ModelType.llm,
+                        metadata=entry.metadata,
+                        provider_id=self.__provider_id__,
+                    )
+                )
+        return models
+
+    async def should_refresh_models(self) -> bool:
+        return False
+
    def get_provider_model_id(self, identifier: str) -> str | None:
        return self.alias_to_provider_id_map.get(identifier, None)

@ -154,8 +188,8 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
        return model

    async def unregister_model(self, model_id: str) -> None:
-        # TODO: should we block unregistering base supported provider model IDs?
-        if model_id not in self.alias_to_provider_id_map:
-            raise ValueError(f"Model id '{model_id}' is not registered.")
-
-        del self.alias_to_provider_id_map[model_id]
+        # model_id is the identifier, not the provider_resource_id
+        # unfortunately, this ID can be of the form provider_id/model_id which
+        # we never registered. TODO: fix this by significantly rewriting
+        # registration and registry helper
+        pass
--- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@ -152,6 +152,11 @@ class OpenAIVectorStoreMixin(ABC):
        """Load existing OpenAI vector stores into the in-memory cache."""
        self.openai_vector_stores = await self._load_openai_vector_stores()

+    @abstractmethod
+    async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None:
+        """Delete a chunk from a vector store."""
+        pass
+
    @abstractmethod
    async def register_vector_db(self, vector_db: VectorDB) -> None:
        """Register a vector database (provider-specific implementation)."""
@ -763,17 +768,15 @@ class OpenAIVectorStoreMixin(ABC):
        if vector_store_id not in self.openai_vector_stores:
            raise ValueError(f"Vector store {vector_store_id} not found")

+        dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id)
+        chunks = [Chunk.model_validate(c) for c in dict_chunks]
+        await self.delete_chunks(vector_store_id, [str(c.chunk_id) for c in chunks if c.chunk_id])
+
        store_info = self.openai_vector_stores[vector_store_id].copy()

        file = await self.openai_retrieve_vector_store_file(vector_store_id, file_id)
        await self._delete_openai_vector_store_file_from_storage(vector_store_id, file_id)

-        # TODO: We need to actually delete the embeddings from the underlying vector store...
-        # Also uncomment the corresponding integration test marked as xfail
-        #
-        # test_openai_vector_store_delete_file_removes_from_vector_store in
-        # tests/integration/vector_io/test_openai_vector_stores.py
-
        # Update in-memory cache
        store_info["file_ids"].remove(file_id)
        store_info["file_counts"][file.status] -= 1
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@ -231,6 +231,10 @@ class EmbeddingIndex(ABC):
    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        raise NotImplementedError()

+    @abstractmethod
+    async def delete_chunk(self, chunk_id: str):
+        raise NotImplementedError()
+
    @abstractmethod
    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
        raise NotImplementedError()
--- a/llama_stack/providers/utils/tools/mcp.py
+++ b/llama_stack/providers/utils/tools/mcp.py
@ -4,13 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
+from enum import Enum
 from typing import Any, cast

 import httpx
-from mcp import ClientSession
+from mcp import ClientSession, McpError
 from mcp import types as mcp_types
 from mcp.client.sse import sse_client
+from mcp.client.streamable_http import streamablehttp_client

 from llama_stack.apis.common.content_types import ImageContentItem, InterleavedContentItem, TextContentItem
 from llama_stack.apis.tools import (
@ -21,31 +24,61 @@ from llama_stack.apis.tools import (
 )
 from llama_stack.distribution.datatypes import AuthenticationRequiredError
 from llama_stack.log import get_logger
+from llama_stack.providers.utils.tools.ttl_dict import TTLDict

 logger = get_logger(__name__, category="tools")

+protocol_cache = TTLDict(ttl_seconds=3600)
+
+
+class MCPProtol(Enum):
+    UNKNOWN = 0
+    STREAMABLE_HTTP = 1
+    SSE = 2
+

@asynccontextmanager
-async def sse_client_wrapper(endpoint: str, headers: dict[str, str]):
-    try:
-        async with sse_client(endpoint, headers=headers) as streams:
-            async with ClientSession(*streams) as session:
-                await session.initialize()
-                yield session
-    except* httpx.HTTPStatusError as eg:
-        for exc in eg.exceptions:
-            # mypy does not currently narrow the type of `eg.exceptions` based on the `except*` filter,
-            # so we explicitly cast each item to httpx.HTTPStatusError. This is safe because
-            # `except* httpx.HTTPStatusError` guarantees all exceptions in `eg.exceptions` are of that type.
-            err = cast(httpx.HTTPStatusError, exc)
-            if err.response.status_code == 401:
-                raise AuthenticationRequiredError(exc) from exc
-        raise
+async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerator[ClientSession, Any]:
+    # we use a ttl'd dict to cache the happy path protocol for each endpoint
+    # but, we always fall back to trying the other protocol if we cannot initialize the session
+    connection_strategies = [MCPProtol.STREAMABLE_HTTP, MCPProtol.SSE]
+    mcp_protocol = protocol_cache.get(endpoint, default=MCPProtol.UNKNOWN)
+    if mcp_protocol == MCPProtol.SSE:
+        connection_strategies = [MCPProtol.SSE, MCPProtol.STREAMABLE_HTTP]
+
+    for i, strategy in enumerate(connection_strategies):
+        try:
+            client = streamablehttp_client
+            if strategy == MCPProtol.SSE:
+                client = sse_client
+            async with client(endpoint, headers=headers) as client_streams:
+                async with ClientSession(read_stream=client_streams[0], write_stream=client_streams[1]) as session:
+                    await session.initialize()
+                    protocol_cache[endpoint] = strategy
+                    yield session
+                    return
+        except* httpx.HTTPStatusError as eg:
+            for exc in eg.exceptions:
+                # mypy does not currently narrow the type of `eg.exceptions` based on the `except*` filter,
+                # so we explicitly cast each item to httpx.HTTPStatusError. This is safe because
+                # `except* httpx.HTTPStatusError` guarantees all exceptions in `eg.exceptions` are of that type.
+                err = cast(httpx.HTTPStatusError, exc)
+                if err.response.status_code == 401:
+                    raise AuthenticationRequiredError(exc) from exc
+            if i == len(connection_strategies) - 1:
+                raise
+        except* McpError:
+            if i < len(connection_strategies) - 1:
+                logger.warning(
+                    f"failed to connect via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
+            else:
+                raise


 async def list_mcp_tools(endpoint: str, headers: dict[str, str]) -> ListToolDefsResponse:
    tools = []
-    async with sse_client_wrapper(endpoint, headers) as session:
+    async with client_wrapper(endpoint, headers) as session:
        tools_result = await session.list_tools()
        for tool in tools_result.tools:
            parameters = []
@ -73,7 +106,7 @@ async def list_mcp_tools(endpoint: str, headers: dict[str, str]) -> ListToolDefs
 async def invoke_mcp_tool(
    endpoint: str, headers: dict[str, str], tool_name: str, kwargs: dict[str, Any]
 ) -> ToolInvocationResult:
-    async with sse_client_wrapper(endpoint, headers) as session:
+    async with client_wrapper(endpoint, headers) as session:
        result = await session.call_tool(tool_name, kwargs)

        content: list[InterleavedContentItem] = []
--- a/llama_stack/providers/utils/tools/ttl_dict.py
+++ b/llama_stack/providers/utils/tools/ttl_dict.py
@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from threading import RLock
+from typing import Any
+
+
+class TTLDict(dict):
+    """
+    A dictionary with a ttl for each item
+    """
+
+    def __init__(self, ttl_seconds: float, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.ttl_seconds = ttl_seconds
+        self._expires: dict[Any, Any] = {}  # expires holds when an item will expire
+        self._lock = RLock()
+
+        if args or kwargs:
+            for k, v in self.items():
+                self.__setitem__(k, v)
+
+    def __delitem__(self, key):
+        with self._lock:
+            del self._expires[key]
+            super().__delitem__(key)
+
+    def __setitem__(self, key, value):
+        with self._lock:
+            self._expires[key] = time.monotonic() + self.ttl_seconds
+            super().__setitem__(key, value)
+
+    def _is_expired(self, key):
+        if key not in self._expires:
+            return False
+        return time.monotonic() > self._expires[key]
+
+    def __getitem__(self, key):
+        with self._lock:
+            if self._is_expired(key):
+                del self._expires[key]
+                super().__delitem__(key)
+                raise KeyError(f"{key} has expired and was removed")
+
+            return super().__getitem__(key)
+
+    def get(self, key, default=None):
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        try:
+            _ = self[key]
+            return True
+        except KeyError:
+            return False
+
+    def __repr__(self):
+        with self._lock:
+            for key in self.keys():
+                if self._is_expired(key):
+                    del self._expires[key]
+                    super().__delitem__(key)
+            return f"TTLDict({self.ttl_seconds}, {super().__repr__()})"
--- a/llama_stack/schema_utils.py
+++ b/llama_stack/schema_utils.py
@ -22,6 +22,7 @@ class WebMethod:
    # A descriptive name of the corresponding span created by tracing
    descriptive_name: str | None = None
    experimental: bool | None = False
+    required_scope: str | None = None


 T = TypeVar("T", bound=Callable[..., Any])
@ -36,6 +37,7 @@ def webmethod(
    raw_bytes_request_body: bool | None = False,
    descriptive_name: str | None = None,
    experimental: bool | None = False,
+    required_scope: str | None = None,
 ) -> Callable[[T], T]:
    """
    Decorator that supplies additional metadata to an endpoint operation function.
@ -45,6 +47,7 @@ def webmethod(
    :param request_examples: Sample requests that the operation might take. Pass a list of objects, not JSON.
    :param response_examples: Sample responses that the operation might produce. Pass a list of objects, not JSON.
    :param experimental: True if the operation is experimental and subject to change.
+    :param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer').
    """

    def wrap(func: T) -> T:
@ -57,6 +60,7 @@ def webmethod(
            raw_bytes_request_body=raw_bytes_request_body,
            descriptive_name=descriptive_name,
            experimental=experimental,
+            required_scope=required_scope,
        )
        return func

--- a/llama_stack/templates/ci-tests/build.yaml
+++ b/llama_stack/templates/ci-tests/build.yaml
@ -3,57 +3,52 @@ distribution_spec:
  description: CI tests for Llama Stack
  providers:
    inference:
-    - remote::cerebras
-    - remote::ollama
-    - remote::vllm
-    - remote::tgi
-    - remote::hf::serverless
-    - remote::hf::endpoint
-    - remote::fireworks
-    - remote::together
-    - remote::bedrock
-    - remote::databricks
-    - remote::nvidia
-    - remote::runpod
-    - remote::openai
-    - remote::anthropic
-    - remote::gemini
-    - remote::groq
-    - remote::llama-openai-compat
-    - remote::sambanova
-    - remote::passthrough
-    - inline::sentence-transformers
+    - provider_type: remote::cerebras
+    - provider_type: remote::ollama
+    - provider_type: remote::vllm
+    - provider_type: remote::tgi
+    - provider_type: remote::fireworks
+    - provider_type: remote::together
+    - provider_type: remote::bedrock
+    - provider_type: remote::nvidia
+    - provider_type: remote::openai
+    - provider_type: remote::anthropic
+    - provider_type: remote::gemini
+    - provider_type: remote::groq
+    - provider_type: remote::sambanova
+    - provider_type: inline::sentence-transformers
    vector_io:
-    - inline::faiss
-    - inline::sqlite-vec
-    - inline::milvus
-    - remote::chromadb
-    - remote::pgvector
+    - provider_type: inline::faiss
+    - provider_type: inline::sqlite-vec
+    - provider_type: inline::milvus
+    - provider_type: remote::chromadb
+    - provider_type: remote::pgvector
    files:
-    - inline::localfs
+    - provider_type: inline::localfs
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    post_training:
-    - inline::huggingface
+    - provider_type: inline::huggingface
    eval:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    datasetio:
-    - remote::huggingface
-    - inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
+    - provider_type: remote::model-context-protocol
 image_type: conda
+image_name: ci-tests
 additional_pip_packages:
 - aiosqlite
 - asyncpg
--- a/llama_stack/templates/ci-tests/run.yaml
+++ b/llama_stack/templates/ci-tests/run.yaml
--- a/llama_stack/templates/dell/build.yaml
+++ b/llama_stack/templates/dell/build.yaml
@ -4,32 +4,33 @@ distribution_spec:
    container
  providers:
    inference:
-    - remote::tgi
-    - inline::sentence-transformers
+    - provider_type: remote::tgi
+    - provider_type: inline::sentence-transformers
    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
+    - provider_type: inline::faiss
+    - provider_type: remote::chromadb
+    - provider_type: remote::pgvector
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    eval:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    datasetio:
-    - remote::huggingface
-    - inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
 image_type: conda
+image_name: dell
 additional_pip_packages:
 - aiosqlite
 - sqlalchemy[asyncio]
--- a/llama_stack/templates/dell/dell.py
+++ b/llama_stack/templates/dell/dell.py
@ -6,6 +6,7 @@

 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
+    BuildProvider,
    ModelInput,
    Provider,
    ShieldInput,
@ -19,18 +20,32 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin

 def get_distribution_template() -> DistributionTemplate:
    providers = {
-        "inference": ["remote::tgi", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "inference": [
+            BuildProvider(provider_type="remote::tgi"),
+            BuildProvider(provider_type="inline::sentence-transformers"),
+        ],
+        "vector_io": [
+            BuildProvider(provider_type="inline::faiss"),
+            BuildProvider(provider_type="remote::chromadb"),
+            BuildProvider(provider_type="remote::pgvector"),
+        ],
+        "safety": [BuildProvider(provider_type="inline::llama-guard")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
+        "eval": [BuildProvider(provider_type="inline::meta-reference")],
+        "datasetio": [
+            BuildProvider(provider_type="remote::huggingface"),
+            BuildProvider(provider_type="inline::localfs"),
+        ],
+        "scoring": [
+            BuildProvider(provider_type="inline::basic"),
+            BuildProvider(provider_type="inline::llm-as-judge"),
+            BuildProvider(provider_type="inline::braintrust"),
+        ],
        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
+            BuildProvider(provider_type="remote::brave-search"),
+            BuildProvider(provider_type="remote::tavily-search"),
+            BuildProvider(provider_type="inline::rag-runtime"),
        ],
    }
    name = "dell"
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@ -22,7 +22,6 @@ providers:
      url: ${env.DEH_SAFETY_URL}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
-    config: {}
  vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
@ -74,10 +73,8 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
-    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
@ -95,7 +92,6 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@ -18,7 +18,6 @@ providers:
      url: ${env.DEH_URL}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
-    config: {}
  vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
@ -70,10 +69,8 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
-    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
@ -91,7 +88,6 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
--- a/llama_stack/templates/meta-reference-gpu/build.yaml
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@ -3,32 +3,33 @@ distribution_spec:
  description: Use Meta Reference for running LLM inference
  providers:
    inference:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    vector_io:
-    - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
+    - provider_type: inline::faiss
+    - provider_type: remote::chromadb
+    - provider_type: remote::pgvector
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    eval:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    datasetio:
-    - remote::huggingface
-    - inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
+    - provider_type: remote::model-context-protocol
 image_type: conda
+image_name: meta-reference-gpu
 additional_pip_packages:
 - aiosqlite
 - sqlalchemy[asyncio]
--- a/llama_stack/templates/meta-reference-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py
@ -8,6 +8,7 @@ from pathlib import Path

 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
+    BuildProvider,
    ModelInput,
    Provider,
    ShieldInput,
@ -25,19 +26,30 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin

 def get_distribution_template() -> DistributionTemplate:
    providers = {
-        "inference": ["inline::meta-reference"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "inference": [BuildProvider(provider_type="inline::meta-reference")],
+        "vector_io": [
+            BuildProvider(provider_type="inline::faiss"),
+            BuildProvider(provider_type="remote::chromadb"),
+            BuildProvider(provider_type="remote::pgvector"),
+        ],
+        "safety": [BuildProvider(provider_type="inline::llama-guard")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
+        "eval": [BuildProvider(provider_type="inline::meta-reference")],
+        "datasetio": [
+            BuildProvider(provider_type="remote::huggingface"),
+            BuildProvider(provider_type="inline::localfs"),
+        ],
+        "scoring": [
+            BuildProvider(provider_type="inline::basic"),
+            BuildProvider(provider_type="inline::llm-as-judge"),
+            BuildProvider(provider_type="inline::braintrust"),
+        ],
        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
+            BuildProvider(provider_type="remote::brave-search"),
+            BuildProvider(provider_type="remote::tavily-search"),
+            BuildProvider(provider_type="inline::rag-runtime"),
+            BuildProvider(provider_type="remote::model-context-protocol"),
        ],
    }
    name = "meta-reference-gpu"
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@ -24,7 +24,6 @@ providers:
      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
-    config: {}
  - provider_id: meta-reference-safety
    provider_type: inline::meta-reference
    config:
@ -88,10 +87,8 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
-    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
@ -109,10 +106,8 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/registry.db
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@ -24,7 +24,6 @@ providers:
      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
-    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
@ -78,10 +77,8 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
-    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
@ -99,10 +96,8 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/registry.db
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@ -3,27 +3,28 @@ distribution_spec:
  description: Use NVIDIA NIM for running LLM inference, evaluation and safety
  providers:
    inference:
-    - remote::nvidia
+    - provider_type: remote::nvidia
    vector_io:
-    - inline::faiss
+    - provider_type: inline::faiss
    safety:
-    - remote::nvidia
+    - provider_type: remote::nvidia
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    eval:
-    - remote::nvidia
+    - provider_type: remote::nvidia
    post_training:
-    - remote::nvidia
+    - provider_type: remote::nvidia
    datasetio:
-    - inline::localfs
-    - remote::nvidia
+    - provider_type: inline::localfs
+    - provider_type: remote::nvidia
    scoring:
-    - inline::basic
+    - provider_type: inline::basic
    tool_runtime:
-    - inline::rag-runtime
+    - provider_type: inline::rag-runtime
 image_type: conda
+image_name: nvidia
 additional_pip_packages:
 - aiosqlite
 - sqlalchemy[asyncio]
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@ -1,3 +1,6 @@
+---
+orphan: true
+---
 # NVIDIA Distribution

 The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@ -6,7 +6,7 @@

 from pathlib import Path

-from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
+from llama_stack.distribution.datatypes import BuildProvider, ModelInput, Provider, ShieldInput, ToolGroupInput
 from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
 from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
@ -17,16 +17,19 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin

 def get_distribution_template() -> DistributionTemplate:
    providers = {
-        "inference": ["remote::nvidia"],
-        "vector_io": ["inline::faiss"],
-        "safety": ["remote::nvidia"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["remote::nvidia"],
-        "post_training": ["remote::nvidia"],
-        "datasetio": ["inline::localfs", "remote::nvidia"],
-        "scoring": ["inline::basic"],
-        "tool_runtime": ["inline::rag-runtime"],
+        "inference": [BuildProvider(provider_type="remote::nvidia")],
+        "vector_io": [BuildProvider(provider_type="inline::faiss")],
+        "safety": [BuildProvider(provider_type="remote::nvidia")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
+        "eval": [BuildProvider(provider_type="remote::nvidia")],
+        "post_training": [BuildProvider(provider_type="remote::nvidia")],
+        "datasetio": [
+            BuildProvider(provider_type="inline::localfs"),
+            BuildProvider(provider_type="remote::nvidia"),
+        ],
+        "scoring": [BuildProvider(provider_type="inline::basic")],
+        "tool_runtime": [BuildProvider(provider_type="inline::rag-runtime")],
    }

    inference_provider = Provider(
--- a/llama_stack/templates/nvidia/run-with-safety.yaml
+++ b/llama_stack/templates/nvidia/run-with-safety.yaml
@ -85,11 +85,9 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@ -74,11 +74,9 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
@ -91,101 +89,51 @@ models:
  provider_id: nvidia
  provider_model_id: meta/llama3-8b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3-8B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-8b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama3-70b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama3-70b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama3-70b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.1-8b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.1-8b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-8b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.1-70b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.1-70b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-70b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.1-405b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.1-405b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.1-405b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.2-1b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.2-1b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-1b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.2-3b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.2-3b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-3b-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.2-11b-vision-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.2-11b-vision-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-11b-vision-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.2-90b-vision-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.2-90b-vision-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.2-90b-vision-instruct
-  model_type: llm
 - metadata: {}
  model_id: meta/llama-3.3-70b-instruct
  provider_id: nvidia
  provider_model_id: meta/llama-3.3-70b-instruct
  model_type: llm
- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: nvidia
-  provider_model_id: meta/llama-3.3-70b-instruct
-  model_type: llm
 - metadata:
    embedding_dimension: 2048
    context_length: 8192
--- a/llama_stack/templates/open-benchmark/build.yaml
+++ b/llama_stack/templates/open-benchmark/build.yaml
@ -3,36 +3,37 @@ distribution_spec:
  description: Distribution for running open benchmarks
  providers:
    inference:
-    - remote::openai
-    - remote::anthropic
-    - remote::gemini
-    - remote::groq
-    - remote::together
+    - provider_type: remote::openai
+    - provider_type: remote::anthropic
+    - provider_type: remote::gemini
+    - provider_type: remote::groq
+    - provider_type: remote::together
    vector_io:
-    - inline::sqlite-vec
-    - remote::chromadb
-    - remote::pgvector
+    - provider_type: inline::sqlite-vec
+    - provider_type: remote::chromadb
+    - provider_type: remote::pgvector
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    eval:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    datasetio:
-    - remote::huggingface
-    - inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
+    - provider_type: remote::model-context-protocol
 image_type: conda
+image_name: open-benchmark
 additional_pip_packages:
 - aiosqlite
 - sqlalchemy[asyncio]
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@ -9,6 +9,7 @@ from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
    BenchmarkInput,
+    BuildProvider,
    DatasetInput,
    ModelInput,
    Provider,
@ -96,19 +97,30 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
 def get_distribution_template() -> DistributionTemplate:
    inference_providers, available_models = get_inference_providers()
    providers = {
-        "inference": [p.provider_type for p in inference_providers],
-        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in inference_providers],
+        "vector_io": [
+            BuildProvider(provider_type="inline::sqlite-vec"),
+            BuildProvider(provider_type="remote::chromadb"),
+            BuildProvider(provider_type="remote::pgvector"),
+        ],
+        "safety": [BuildProvider(provider_type="inline::llama-guard")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
+        "eval": [BuildProvider(provider_type="inline::meta-reference")],
+        "datasetio": [
+            BuildProvider(provider_type="remote::huggingface"),
+            BuildProvider(provider_type="inline::localfs"),
+        ],
+        "scoring": [
+            BuildProvider(provider_type="inline::basic"),
+            BuildProvider(provider_type="inline::llm-as-judge"),
+            BuildProvider(provider_type="inline::braintrust"),
+        ],
        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
+            BuildProvider(provider_type="remote::brave-search"),
+            BuildProvider(provider_type="remote::tavily-search"),
+            BuildProvider(provider_type="inline::rag-runtime"),
+            BuildProvider(provider_type="remote::model-context-protocol"),
        ],
    }
    name = "open-benchmark"
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@ -33,7 +33,7 @@ providers:
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
-      api_key: ${env.TOGETHER_API_KEY}
+      api_key: ${env.TOGETHER_API_KEY:=}
  vector_io:
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
@ -106,10 +106,8 @@ providers:
  scoring:
  - provider_id: basic
    provider_type: inline::basic
-    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
-    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
@ -127,10 +125,8 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
-    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/registry.db
--- a/llama_stack/templates/postgres-demo/build.yaml
+++ b/llama_stack/templates/postgres-demo/build.yaml
@ -3,22 +3,23 @@ distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers
  providers:
    inference:
-    - remote::vllm
-    - inline::sentence-transformers
+    - provider_type: remote::vllm
+    - provider_type: inline::sentence-transformers
    vector_io:
-    - remote::chromadb
+    - provider_type: remote::chromadb
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
+    - provider_type: remote::model-context-protocol
 image_type: conda
+image_name: postgres-demo
 additional_pip_packages:
 - asyncpg
 - psycopg2-binary
--- a/llama_stack/templates/postgres-demo/postgres_demo.py
+++ b/llama_stack/templates/postgres-demo/postgres_demo.py
@ -7,6 +7,7 @@

 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
+    BuildProvider,
    ModelInput,
    Provider,
    ShieldInput,
@ -34,16 +35,19 @@ def get_distribution_template() -> DistributionTemplate:
        ),
    ]
    providers = {
-        "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
-        "vector_io": ["remote::chromadb"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
+        "inference": [
+            BuildProvider(provider_type="remote::vllm"),
+            BuildProvider(provider_type="inline::sentence-transformers"),
+        ],
+        "vector_io": [BuildProvider(provider_type="remote::chromadb")],
+        "safety": [BuildProvider(provider_type="inline::llama-guard")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
+            BuildProvider(provider_type="remote::brave-search"),
+            BuildProvider(provider_type="remote::tavily-search"),
+            BuildProvider(provider_type="inline::rag-runtime"),
+            BuildProvider(provider_type="remote::model-context-protocol"),
        ],
    }
    name = "postgres-demo"
--- a/llama_stack/templates/postgres-demo/run.yaml
+++ b/llama_stack/templates/postgres-demo/run.yaml
@ -18,7 +18,6 @@ providers:
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
-    config: {}
  vector_io:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
@ -70,10 +69,8 @@ providers:
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
-    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
-    config: {}
 metadata_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
--- a/llama_stack/templates/starter/build.yaml
+++ b/llama_stack/templates/starter/build.yaml
@ -3,57 +3,52 @@ distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers
  providers:
    inference:
-    - remote::cerebras
-    - remote::ollama
-    - remote::vllm
-    - remote::tgi
-    - remote::hf::serverless
-    - remote::hf::endpoint
-    - remote::fireworks
-    - remote::together
-    - remote::bedrock
-    - remote::databricks
-    - remote::nvidia
-    - remote::runpod
-    - remote::openai
-    - remote::anthropic
-    - remote::gemini
-    - remote::groq
-    - remote::llama-openai-compat
-    - remote::sambanova
-    - remote::passthrough
-    - inline::sentence-transformers
+    - provider_type: remote::cerebras
+    - provider_type: remote::ollama
+    - provider_type: remote::vllm
+    - provider_type: remote::tgi
+    - provider_type: remote::fireworks
+    - provider_type: remote::together
+    - provider_type: remote::bedrock
+    - provider_type: remote::nvidia
+    - provider_type: remote::openai
+    - provider_type: remote::anthropic
+    - provider_type: remote::gemini
+    - provider_type: remote::groq
+    - provider_type: remote::sambanova
+    - provider_type: inline::sentence-transformers
    vector_io:
-    - inline::faiss
-    - inline::sqlite-vec
-    - inline::milvus
-    - remote::chromadb
-    - remote::pgvector
+    - provider_type: inline::faiss
+    - provider_type: inline::sqlite-vec
+    - provider_type: inline::milvus
+    - provider_type: remote::chromadb
+    - provider_type: remote::pgvector
    files:
-    - inline::localfs
+    - provider_type: inline::localfs
    safety:
-    - inline::llama-guard
+    - provider_type: inline::llama-guard
    agents:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    telemetry:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    post_training:
-    - inline::huggingface
+    - provider_type: inline::huggingface
    eval:
-    - inline::meta-reference
+    - provider_type: inline::meta-reference
    datasetio:
-    - remote::huggingface
-    - inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
    tool_runtime:
-    - remote::brave-search
-    - remote::tavily-search
-    - inline::rag-runtime
-    - remote::model-context-protocol
+    - provider_type: remote::brave-search
+    - provider_type: remote::tavily-search
+    - provider_type: inline::rag-runtime
+    - provider_type: remote::model-context-protocol
 image_type: conda
+image_name: starter
 additional_pip_packages:
 - aiosqlite
 - asyncpg
--- a/llama_stack/templates/starter/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
--- a/llama_stack/templates/starter/starter.py
+++ b/llama_stack/templates/starter/starter.py
@ -7,19 +7,19 @@

 from typing import Any

-from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
-    ModelInput,
+    BuildProvider,
    Provider,
    ProviderSpec,
+    ShieldInput,
    ToolGroupInput,
 )
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
+from llama_stack.providers.datatypes import RemoteProviderSpec
 from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
 )
-from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.milvus.config import (
    MilvusVectorIOConfig,
@ -28,117 +28,17 @@ from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
    SQLiteVectorIOConfig,
 )
 from llama_stack.providers.registry.inference import available_providers
-from llama_stack.providers.remote.inference.anthropic.models import (
-    MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.bedrock.models import (
-    MODEL_ENTRIES as BEDROCK_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.cerebras.models import (
-    MODEL_ENTRIES as CEREBRAS_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.databricks.databricks import (
-    MODEL_ENTRIES as DATABRICKS_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.fireworks.models import (
-    MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.gemini.models import (
-    MODEL_ENTRIES as GEMINI_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.groq.models import (
-    MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.nvidia.models import (
-    MODEL_ENTRIES as NVIDIA_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.openai.models import (
-    MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.runpod.runpod import (
-    MODEL_ENTRIES as RUNPOD_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.sambanova.models import (
-    MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES,
-)
-from llama_stack.providers.remote.inference.together.models import (
-    MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES,
-)
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
 from llama_stack.providers.remote.vector_io.pgvector.config import (
    PGVectorVectorIOConfig,
 )
-from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
 from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
 from llama_stack.templates.template import (
    DistributionTemplate,
    RunConfigSettings,
-    get_model_registry,
-    get_shield_registry,
 )


-def _get_model_entries_for_provider(provider_type: str) -> list[ProviderModelEntry]:
-    """Get model entries for a specific provider type."""
-    model_entries_map = {
-        "openai": OPENAI_MODEL_ENTRIES,
-        "fireworks": FIREWORKS_MODEL_ENTRIES,
-        "together": TOGETHER_MODEL_ENTRIES,
-        "anthropic": ANTHROPIC_MODEL_ENTRIES,
-        "gemini": GEMINI_MODEL_ENTRIES,
-        "groq": GROQ_MODEL_ENTRIES,
-        "sambanova": SAMBANOVA_MODEL_ENTRIES,
-        "cerebras": CEREBRAS_MODEL_ENTRIES,
-        "bedrock": BEDROCK_MODEL_ENTRIES,
-        "databricks": DATABRICKS_MODEL_ENTRIES,
-        "nvidia": NVIDIA_MODEL_ENTRIES,
-        "runpod": RUNPOD_MODEL_ENTRIES,
-    }
-
-    # Special handling for providers with dynamic model entries
-    if provider_type == "ollama":
-        return [
-            ProviderModelEntry(
-                provider_model_id="${env.OLLAMA_INFERENCE_MODEL:=__disabled__}",
-                model_type=ModelType.llm,
-            ),
-            ProviderModelEntry(
-                provider_model_id="${env.SAFETY_MODEL:=__disabled__}",
-                model_type=ModelType.llm,
-            ),
-            ProviderModelEntry(
-                provider_model_id="${env.OLLAMA_EMBEDDING_MODEL:=__disabled__}",
-                model_type=ModelType.embedding,
-                metadata={
-                    "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:=384}",
-                },
-            ),
-        ]
-    elif provider_type == "vllm":
-        return [
-            ProviderModelEntry(
-                provider_model_id="${env.VLLM_INFERENCE_MODEL:=__disabled__}",
-                model_type=ModelType.llm,
-            ),
-        ]
-
-    return model_entries_map.get(provider_type, [])
-
-
-def _get_model_safety_entries_for_provider(provider_type: str) -> list[ProviderModelEntry]:
-    """Get model entries for a specific provider type."""
-    safety_model_entries_map = {
-        "ollama": [
-            ProviderModelEntry(
-                provider_model_id="${env.SAFETY_MODEL:=__disabled__}",
-                model_type=ModelType.llm,
-            ),
-        ],
-    }
-
-    return safety_model_entries_map.get(provider_type, [])
-
-
 def _get_config_for_provider(provider_spec: ProviderSpec) -> dict[str, Any]:
    """Get configuration for a provider using its adapter's config class."""
    config_class = instantiate_class_type(provider_spec.config_class)
@ -149,40 +49,48 @@ def _get_config_for_provider(provider_spec: ProviderSpec) -> dict[str, Any]:
    return {}


-def get_remote_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
-    all_providers = available_providers()
+ENABLED_INFERENCE_PROVIDERS = [
+    "ollama",
+    "vllm",
+    "tgi",
+    "fireworks",
+    "together",
+    "gemini",
+    "groq",
+    "sambanova",
+    "anthropic",
+    "openai",
+    "cerebras",
+    "nvidia",
+    "bedrock",
+]

-    # Filter out inline providers and watsonx - the starter distro only exposes remote providers
+INFERENCE_PROVIDER_IDS = {
+    "vllm": "${env.VLLM_URL:+vllm}",
+    "tgi": "${env.TGI_URL:+tgi}",
+    "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
+    "nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
+}
+
+
+def get_remote_inference_providers() -> list[Provider]:
+    # Filter out inline providers and some others - the starter distro only exposes remote providers
    remote_providers = [
        provider
-        for provider in all_providers
-        # TODO: re-add once the Python 3.13 issue is fixed
-        # discussion: https://github.com/meta-llama/llama-stack/pull/2327#discussion_r2156883828
-        if hasattr(provider, "adapter") and provider.adapter.adapter_type != "watsonx"
+        for provider in available_providers()
+        if isinstance(provider, RemoteProviderSpec) and provider.adapter.adapter_type in ENABLED_INFERENCE_PROVIDERS
    ]

-    providers = []
-    available_models = {}
-
+    inference_providers = []
    for provider_spec in remote_providers:
        provider_type = provider_spec.adapter.adapter_type

-        # Build the environment variable name for enabling this provider
-        env_var = f"ENABLE_{provider_type.upper().replace('-', '_').replace('::', '_')}"
-        model_entries = _get_model_entries_for_provider(provider_type)
+        if provider_type in INFERENCE_PROVIDER_IDS:
+            provider_id = INFERENCE_PROVIDER_IDS[provider_type]
+        else:
+            provider_id = provider_type.replace("-", "_").replace("::", "_")
        config = _get_config_for_provider(provider_spec)
-        providers.append(
-            (
-                f"${{env.{env_var}:=__disabled__}}",
-                provider_type,
-                model_entries,
-                config,
-            )
-        )
-        available_models[f"${{env.{env_var}:=__disabled__}}"] = model_entries

-    inference_providers = []
-    for provider_id, provider_type, model_entries, config in providers:
        inference_providers.append(
            Provider(
                provider_id=provider_id,
@ -190,84 +98,43 @@ def get_remote_inference_providers() -> tuple[list[Provider], dict[str, list[Pro
                config=config,
            )
        )
-        available_models[provider_id] = model_entries
-    return inference_providers, available_models
-
-
-# build a list of shields for all possible providers
-def get_safety_models_for_providers(providers: list[Provider]) -> dict[str, list[ProviderModelEntry]]:
-    available_models = {}
-    for provider in providers:
-        provider_type = provider.provider_type.split("::")[1]
-        safety_model_entries = _get_model_safety_entries_for_provider(provider_type)
-        if len(safety_model_entries) == 0:
-            continue
-
-        env_var = f"ENABLE_{provider_type.upper().replace('-', '_').replace('::', '_')}"
-        provider_id = f"${{env.{env_var}:=__disabled__}}"
-
-        available_models[provider_id] = safety_model_entries
-
-    return available_models
+    return inference_providers


 def get_distribution_template() -> DistributionTemplate:
-    remote_inference_providers, available_models = get_remote_inference_providers()
-
+    remote_inference_providers = get_remote_inference_providers()
    name = "starter"

-    vector_io_providers = [
-        Provider(
-            provider_id="${env.ENABLE_FAISS:=faiss}",
-            provider_type="inline::faiss",
-            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_SQLITE_VEC:=__disabled__}",
-            provider_type="inline::sqlite-vec",
-            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_MILVUS:=__disabled__}",
-            provider_type="inline::milvus",
-            config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_CHROMADB:=__disabled__}",
-            provider_type="remote::chromadb",
-            config=ChromaVectorIOConfig.sample_run_config(
-                f"~/.llama/distributions/{name}/",
-                url="${env.CHROMADB_URL:=}",
-            ),
-        ),
-        Provider(
-            provider_id="${env.ENABLE_PGVECTOR:=__disabled__}",
-            provider_type="remote::pgvector",
-            config=PGVectorVectorIOConfig.sample_run_config(
-                f"~/.llama/distributions/{name}",
-                db="${env.PGVECTOR_DB:=}",
-                user="${env.PGVECTOR_USER:=}",
-                password="${env.PGVECTOR_PASSWORD:=}",
-            ),
-        ),
-    ]
-
    providers = {
-        "inference": ([p.provider_type for p in remote_inference_providers] + ["inline::sentence-transformers"]),
-        "vector_io": ([p.provider_type for p in vector_io_providers]),
-        "files": ["inline::localfs"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "post_training": ["inline::huggingface"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
+        + [BuildProvider(provider_type="inline::sentence-transformers")],
+        "vector_io": [
+            BuildProvider(provider_type="inline::faiss"),
+            BuildProvider(provider_type="inline::sqlite-vec"),
+            BuildProvider(provider_type="inline::milvus"),
+            BuildProvider(provider_type="remote::chromadb"),
+            BuildProvider(provider_type="remote::pgvector"),
+        ],
+        "files": [BuildProvider(provider_type="inline::localfs")],
+        "safety": [BuildProvider(provider_type="inline::llama-guard")],
+        "agents": [BuildProvider(provider_type="inline::meta-reference")],
+        "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
+        "post_training": [BuildProvider(provider_type="inline::huggingface")],
+        "eval": [BuildProvider(provider_type="inline::meta-reference")],
+        "datasetio": [
+            BuildProvider(provider_type="remote::huggingface"),
+            BuildProvider(provider_type="inline::localfs"),
+        ],
+        "scoring": [
+            BuildProvider(provider_type="inline::basic"),
+            BuildProvider(provider_type="inline::llm-as-judge"),
+            BuildProvider(provider_type="inline::braintrust"),
+        ],
        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
+            BuildProvider(provider_type="remote::brave-search"),
+            BuildProvider(provider_type="remote::tavily-search"),
+            BuildProvider(provider_type="inline::rag-runtime"),
+            BuildProvider(provider_type="remote::model-context-protocol"),
        ],
    }
    files_provider = Provider(
@ -276,15 +143,10 @@ def get_distribution_template() -> DistributionTemplate:
        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
    )
    embedding_provider = Provider(
-        provider_id="${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers}",
+        provider_id="sentence-transformers",
        provider_type="inline::sentence-transformers",
        config=SentenceTransformersInferenceConfig.sample_run_config(),
    )
-    post_training_provider = Provider(
-        provider_id="huggingface",
-        provider_type="inline::huggingface",
-        config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
@ -295,19 +157,14 @@ def get_distribution_template() -> DistributionTemplate:
            provider_id="rag-runtime",
        ),
    ]
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id=embedding_provider.provider_id,
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-
-    default_models, ids_conflict_in_models = get_model_registry(available_models)
-
-    available_safety_models = get_safety_models_for_providers(remote_inference_providers)
-    shields = get_shield_registry(available_safety_models, ids_conflict_in_models)
+    default_shields = [
+        # if the
+        ShieldInput(
+            shield_id="llama-guard",
+            provider_id="${env.SAFETY_MODEL:+llama-guard}",
+            provider_shield_id="${env.SAFETY_MODEL:=}",
+        ),
+    ]

    return DistributionTemplate(
        name=name,
@ -316,20 +173,51 @@ def get_distribution_template() -> DistributionTemplate:
        container_image=None,
        template_path=None,
        providers=providers,
-        available_models_by_provider=available_models,
        additional_pip_packages=PostgresSqlStoreConfig.pip_packages(),
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": remote_inference_providers + [embedding_provider],
-                    "vector_io": vector_io_providers,
+                    "vector_io": [
+                        Provider(
+                            provider_id="faiss",
+                            provider_type="inline::faiss",
+                            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+                        ),
+                        Provider(
+                            provider_id="sqlite-vec",
+                            provider_type="inline::sqlite-vec",
+                            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+                        ),
+                        Provider(
+                            provider_id="${env.MILVUS_URL:+milvus}",
+                            provider_type="inline::milvus",
+                            config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+                        ),
+                        Provider(
+                            provider_id="${env.CHROMADB_URL:+chromadb}",
+                            provider_type="remote::chromadb",
+                            config=ChromaVectorIOConfig.sample_run_config(
+                                f"~/.llama/distributions/{name}/",
+                                url="${env.CHROMADB_URL:=}",
+                            ),
+                        ),
+                        Provider(
+                            provider_id="${env.PGVECTOR_DB:+pgvector}",
+                            provider_type="remote::pgvector",
+                            config=PGVectorVectorIOConfig.sample_run_config(
+                                f"~/.llama/distributions/{name}",
+                                db="${env.PGVECTOR_DB:=}",
+                                user="${env.PGVECTOR_USER:=}",
+                                password="${env.PGVECTOR_PASSWORD:=}",
+                            ),
+                        ),
+                    ],
                    "files": [files_provider],
-                    "post_training": [post_training_provider],
                },
-                default_models=[embedding_model] + default_models,
+                default_models=[],
                default_tool_groups=default_tool_groups,
-                # TODO: add a way to enable/disable shields on the fly
-                default_shields=shields,
+                default_shields=default_shields,
            ),
        },
        run_config_env_vars={
@ -373,17 +261,5 @@ def get_distribution_template() -> DistributionTemplate:
                "http://localhost:11434",
                "Ollama URL",
            ),
-            "OLLAMA_INFERENCE_MODEL": (
-                "",
-                "Optional Ollama Inference Model to register on startup",
-            ),
-            "OLLAMA_EMBEDDING_MODEL": (
-                "",
-                "Optional Ollama Embedding Model to register on startup",
-            ),
-            "OLLAMA_EMBEDDING_DIMENSION": (
-                "384",
-                "Ollama Embedding Dimension",
-            ),
        },
    )
--- a/Show more
+++ b/Show more