Merge branch 'main' into nvidia-e2e-notebook

2025-07-21 03:59:42 +00:00 · 2025-04-28 12:00:11 -04:00 · 2025-04-28 12:00:11 -04:00 · 73275f07b7
commit 73275f07b7
parent e24959ea9e c149cf2e0f
123 changed files with 6946 additions and 2220 deletions
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):

@json_schema_type
 class AgentConfig(AgentConfigCommon):
+    """Configuration for an agent.
+
+    :param model: The model identifier to use for the agent
+    :param instructions: The system instructions for the agent
+    :param name: Optional name for the agent, used in telemetry and identification
+    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
+    :param response_format: Optional response format configuration
+    """
+
    model: str
    instructions: str
+    name: Optional[str] = None
    enable_session_persistence: Optional[bool] = False
    response_format: Optional[ResponseFormat] = None

--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
    """

    role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent
+    content: Optional[OpenAIChatCompletionMessageContent] = None
    name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
+    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None


@json_schema_type
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@ -136,12 +136,13 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
        )

        image_type = prompt(
-            f"> Enter the image type you want your Llama Stack to be built as ({' or '.join(e.value for e in ImageType)}): ",
+            "> Enter the image type you want your Llama Stack to be built as (use <TAB> to see options): ",
+            completer=WordCompleter([e.value for e in ImageType]),
+            complete_while_typing=True,
            validator=Validator.from_callable(
                lambda x: x in [e.value for e in ImageType],
-                error_message=f"Invalid image type, please enter {' or '.join(e.value for e in ImageType)}",
+                error_message="Invalid image type. Use <TAB> to see options",
            ),
-            default=ImageType.CONDA.value,
        )

        if image_type == ImageType.CONDA.value:
@ -210,16 +211,9 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                )
                sys.exit(1)

-        if build_config.image_type == LlamaStackImageType.CONTAINER.value and not args.image_name:
-            cprint(
-                "Please specify --image-name when building a container from a config file",
-                color="red",
-            )
-            sys.exit(1)
-
    if args.print_deps_only:
        print(f"# Dependencies for {args.template or args.config or image_name}")
-        normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers)
+        normal_deps, special_deps = get_provider_dependencies(build_config)
        normal_deps += SERVER_DEPENDENCIES
        print(f"uv pip install {' '.join(normal_deps)}")
        for special_dep in special_deps:
@ -235,10 +229,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
        )

    except (Exception, RuntimeError) as exc:
+        import traceback
+
        cprint(
            f"Error building stack: {exc}",
            color="red",
        )
+        cprint("Stack trace:", color="red")
+        traceback.print_exc()
        sys.exit(1)
    if run_config is None:
        cprint(
@ -270,9 +268,10 @@ def _generate_run_config(
        image_name=image_name,
        apis=apis,
        providers={},
+        external_providers_dir=build_config.external_providers_dir if build_config.external_providers_dir else None,
    )
    # build providers dict
-    provider_registry = get_provider_registry()
+    provider_registry = get_provider_registry(build_config)
    for api in apis:
        run_config.providers[api] = []
        provider_types = build_config.distribution_spec.providers[api]
@ -286,8 +285,22 @@ def _generate_run_config(
            if p.deprecation_error:
                raise InvalidProviderError(p.deprecation_error)

-            config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
-            if hasattr(config_type, "sample_run_config"):
+            try:
+                config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
+            except ModuleNotFoundError:
+                # HACK ALERT:
+                # This code executes after the build is done, so the import cannot work: the
+                # package is available only inside the venv or container, not on the host.
+                # TODO: use an "is_external" flag in ProviderSpec to check if the provider is
+                # external
+                cprint(
+                    f"Failed to import provider {provider_type} for API {api} - assuming it's external, skipping",
+                    color="yellow",
+                )
+                # Set config_type to None to avoid UnboundLocalError
+                config_type = None
+
+            if config_type is not None and hasattr(config_type, "sample_run_config"):
                config = config_type.sample_run_config(__distro_dir__=f"~/.llama/distributions/{image_name}")
            else:
                config = {}
@ -305,11 +318,15 @@ def _generate_run_config(
        to_write = json.loads(run_config.model_dump_json())
        f.write(yaml.dump(to_write, sort_keys=False))

-    # this path is only invoked when no template is provided
-    cprint(
-        f"You can now run your stack with `llama stack run {run_config_file}`",
-        color="green",
-    )
+    # Skip this message for container builds: there it would be displayed before the
+    # container is built.
+    # For non-container builds, the run.yaml is generated at the very end of the build process, so it
+    # makes sense to display this message.
+    if build_config.image_type != LlamaStackImageType.CONTAINER.value:
+        cprint(
+            f"You can now run your stack with `llama stack run {run_config_file}`",
+            color="green",
+        )
    return run_config_file


@ -319,6 +336,7 @@ def _run_stack_build_command_from_build_config(
    template_name: Optional[str] = None,
    config_path: Optional[str] = None,
 ) -> str:
+    image_name = image_name or build_config.image_name
    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
        if template_name:
            image_name = f"distribution-{template_name}"
@ -342,6 +360,13 @@ def _run_stack_build_command_from_build_config(
        build_file_path = build_dir / f"{image_name}-build.yaml"

    os.makedirs(build_dir, exist_ok=True)
+    run_config_file = None
+    # Generate the run.yaml so it can be included in the container image with the proper entrypoint.
+    # Only do this when building a container image from a config file (config_path set, no template).
+    if build_config.image_type == LlamaStackImageType.CONTAINER.value and not template_name and config_path:
+        cprint("Generating run.yaml file", color="green")
+        run_config_file = _generate_run_config(build_config, build_dir, image_name)
+
    with open(build_file_path, "w") as f:
        to_write = json.loads(build_config.model_dump_json())
        f.write(yaml.dump(to_write, sort_keys=False))
@ -350,7 +375,8 @@ def _run_stack_build_command_from_build_config(
        build_config,
        build_file_path,
        image_name,
-        template_or_config=template_name or config_path,
+        template_or_config=template_name or config_path or str(build_file_path),
+        run_config=run_config_file,
    )
    if return_code != 0:
        raise RuntimeError(f"Failed to build image {image_name}")
--- a/llama_stack/distribution/build.py
+++ b/llama_stack/distribution/build.py
@ -7,16 +7,16 @@
 import importlib.resources
 import logging
 from pathlib import Path
-from typing import Dict, List

 from pydantic import BaseModel
 from termcolor import cprint

-from llama_stack.distribution.datatypes import BuildConfig, Provider
+from llama_stack.distribution.datatypes import BuildConfig
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.utils.exec import run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api
+from llama_stack.templates.template import DistributionTemplate

 log = logging.getLogger(__name__)

@ -37,19 +37,24 @@ class ApiInput(BaseModel):


 def get_provider_dependencies(
-    config_providers: Dict[str, List[Provider]],
+    config: BuildConfig | DistributionTemplate,
 ) -> tuple[list[str], list[str]]:
    """Get normal and special dependencies from provider configuration."""
-    all_providers = get_provider_registry()
+    # Extract providers based on config type
+    if isinstance(config, DistributionTemplate):
+        providers = config.providers
+    elif isinstance(config, BuildConfig):
+        providers = config.distribution_spec.providers
    deps = []
+    registry = get_provider_registry(config)

-    for api_str, provider_or_providers in config_providers.items():
-        providers_for_api = all_providers[Api(api_str)]
+    for api_str, provider_or_providers in providers.items():
+        providers_for_api = registry[Api(api_str)]

        providers = provider_or_providers if isinstance(provider_or_providers, list) else [provider_or_providers]

        for provider in providers:
-            # Providers from BuildConfig and RunConfig are subtly different – not great
+            # Providers from BuildConfig and RunConfig are subtly different – not great
            provider_type = provider if isinstance(provider, str) else provider.provider_type

            if provider_type not in providers_for_api:
@ -71,8 +76,8 @@ def get_provider_dependencies(
    return list(set(normal_deps)), list(set(special_deps))


-def print_pip_install_help(providers: Dict[str, List[Provider]]):
-    normal_deps, special_deps = get_provider_dependencies(providers)
+def print_pip_install_help(config: BuildConfig):
+    normal_deps, special_deps = get_provider_dependencies(config)

    cprint(
        f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}",
@ -88,10 +93,11 @@ def build_image(
    build_file_path: Path,
    image_name: str,
    template_or_config: str,
+    run_config: str | None = None,
 ):
    container_base = build_config.distribution_spec.container_image or "python:3.10-slim"

-    normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers)
+    normal_deps, special_deps = get_provider_dependencies(build_config)
    normal_deps += SERVER_DEPENDENCIES

    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
@ -103,6 +109,11 @@ def build_image(
            container_base,
            " ".join(normal_deps),
        ]
+
+        # When building from a config file (not a template), include the run config path in the
+        # build arguments
+        if run_config is not None:
+            args.append(run_config)
    elif build_config.image_type == LlamaStackImageType.CONDA.value:
        script = str(importlib.resources.files("llama_stack") / "distribution/build_conda_env.sh")
        args = [
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -19,12 +19,16 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
 # mounting is not supported by docker buildx, so we use COPY instead
 USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-}

+# Path to the run.yaml file in the container
+RUN_CONFIG_PATH=/app/run.yaml
+
+BUILD_CONTEXT_DIR=$(pwd)
+
 if [ "$#" -lt 4 ]; then
  # This only works for templates
-  echo "Usage: $0 <template_or_config> <image_name> <container_base> <pip_dependencies> [<special_pip_deps>]" >&2
+  echo "Usage: $0 <template_or_config> <image_name> <container_base> <pip_dependencies> [<run_config>] [<special_pip_deps>]" >&2
  exit 1
 fi
-
 set -euo pipefail

 template_or_config="$1"
@ -35,8 +39,27 @@ container_base="$1"
 shift
 pip_dependencies="$1"
 shift
-special_pip_deps="${1:-}"

+# Handle optional arguments
+run_config=""
+special_pip_deps=""
+
+# Check if there are more arguments
+# The logic is becoming cumbersome; we should refactor it if we can do better
+if [ $# -gt 0 ]; then
+  # Check if the argument ends with .yaml
+  if [[ "$1" == *.yaml ]]; then
+    run_config="$1"
+    shift
+    # If there's another argument after .yaml, it must be special_pip_deps
+    if [ $# -gt 0 ]; then
+      special_pip_deps="$1"
+    fi
+  else
+    # If it's not .yaml, it must be special_pip_deps
+    special_pip_deps="$1"
+  fi
+fi

 # Define color codes
 RED='\033[0;31m'
@ -72,9 +95,13 @@ if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
 FROM $container_base
 WORKDIR /app

-RUN dnf -y update && dnf install -y iputils net-tools wget \
+# We install the Python 3.11 dev headers and build tools so that any
+# C‑extension wheels (e.g. polyleven, faiss‑cpu) can compile successfully.
+
+RUN dnf -y update && dnf install -y iputils git net-tools wget \
    vim-minimal python3.11 python3.11-pip python3.11-wheel \
-    python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all
+    python3.11-setuptools python3.11-devel gcc make && \
+    ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all

 ENV UV_SYSTEM_PYTHON=1
 RUN pip install uv
@ -86,7 +113,7 @@ WORKDIR /app

 RUN apt-get update && apt-get install -y \
       iputils-ping net-tools iproute2 dnsutils telnet \
-       curl wget telnet \
+       curl wget telnet git\
       procps psmisc lsof \
       traceroute \
       bubblewrap \
@ -115,6 +142,45 @@ EOF
  done
 fi

+# Function to get Python command
+get_python_cmd() {
+    if is_command_available python; then
+        echo "python"
+    elif is_command_available python3; then
+        echo "python3"
+    else
+        echo "Error: Neither python nor python3 is installed. Please install Python to continue." >&2
+        exit 1
+    fi
+}
+
+if [ -n "$run_config" ]; then
+  # Copy the run config to the build context since it's an absolute path
+  cp "$run_config" "$BUILD_CONTEXT_DIR/run.yaml"
+  add_to_container << EOF
+COPY run.yaml $RUN_CONFIG_PATH
+EOF
+
+  # Parse the run.yaml configuration to identify external provider directories
+  # If external providers are specified, copy their directory to the container
+  # and update the configuration to reference the new container path
+  python_cmd=$(get_python_cmd)
+  external_providers_dir=$($python_cmd -c "import yaml; config = yaml.safe_load(open('$run_config')); print(config.get('external_providers_dir') or '')")
+  if [ -n "$external_providers_dir" ]; then
+    echo "Copying external providers directory: $external_providers_dir"
+    add_to_container << EOF
+COPY $external_providers_dir /app/providers.d
+EOF
+    # Edit the run.yaml file to change the external_providers_dir to /app/providers.d
+    if [ "$(uname)" = "Darwin" ]; then
+      sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+      rm -f "$BUILD_CONTEXT_DIR/run.yaml.bak"
+    else
+      sed -i 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+    fi
+  fi
+fi
+
 stack_mount="/app/llama-stack-source"
 client_mount="/app/llama-stack-client-source"

@ -174,15 +240,16 @@ fi
 RUN pip uninstall -y uv
 EOF

-# if template_or_config ends with .yaml, it is not a template and we should not use the --template flag
-if [[ "$template_or_config" != *.yaml ]]; then
+# If a run config is provided, we use the --config flag
+if [[ -n "$run_config" ]]; then
+  add_to_container << EOF
+ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--config", "$RUN_CONFIG_PATH"]
+EOF
+# If a template is provided (not a yaml file), we use the --template flag
+elif [[ "$template_or_config" != *.yaml ]]; then
  add_to_container << EOF
 ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$template_or_config"]
 EOF
-else
-  add_to_container << EOF
-ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
-EOF
 fi

 # Add other require item commands genearic to all containers
@ -254,9 +321,10 @@ $CONTAINER_BINARY build \
  "${CLI_ARGS[@]}" \
  -t "$image_tag" \
  -f "$TEMP_DIR/Containerfile" \
-  "."
+  "$BUILD_CONTEXT_DIR"

 # clean up tmp/configs
+rm -f "$BUILD_CONTEXT_DIR/run.yaml"
 set +x

 echo "Success!"
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@ -326,3 +326,12 @@ class BuildConfig(BaseModel):
        default="conda",
        description="Type of package to build (conda | container | venv)",
    )
+    image_name: Optional[str] = Field(
+        default=None,
+        description="Name of the distribution to build",
+    )
+    external_providers_dir: Optional[str] = Field(
+        default=None,
+        description="Path to directory containing external provider implementations. The providers packages will be resolved from this directory. "
+        "pip_packages MUST contain the provider package name.",
+    )
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@ -12,7 +12,6 @@ from typing import Any, Dict, List
 import yaml
 from pydantic import BaseModel

-from llama_stack.distribution.datatypes import StackRunConfig
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import (
    AdapterSpec,
@ -97,7 +96,9 @@ def _load_inline_provider_spec(spec_data: Dict[str, Any], api: Api, provider_nam
    return spec


-def get_provider_registry(config: StackRunConfig | None = None) -> Dict[Api, Dict[str, ProviderSpec]]:
+def get_provider_registry(
+    config=None,
+) -> Dict[Api, Dict[str, ProviderSpec]]:
    """Get the provider registry, optionally including external providers.

    This function loads both built-in providers and external providers from YAML files.
@ -122,7 +123,7 @@ def get_provider_registry(config: StackRunConfig | None = None) -> Dict[Api, Dic
          llama-guard.yaml

    Args:
-        config: Optional StackRunConfig containing the external providers directory path
+        config: Optional object containing the external providers directory path

    Returns:
        A dictionary mapping APIs to their available providers
@ -142,7 +143,8 @@ def get_provider_registry(config: StackRunConfig | None = None) -> Dict[Api, Dic
        except ImportError as e:
            logger.warning(f"Failed to import module {name}: {e}")

-    if config and config.external_providers_dir:
+    # Check if config has the external_providers_dir attribute
+    if config and hasattr(config, "external_providers_dir") and config.external_providers_dir:
        external_providers_dir = os.path.abspath(config.external_providers_dir)
        if not os.path.exists(external_providers_dir):
            raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@ -8,6 +8,11 @@ import asyncio
 import time
 from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

+from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
+from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
+from pydantic import Field, TypeAdapter
+from typing_extensions import Annotated
+
 from llama_stack.apis.common.content_types import (
    URL,
    InterleavedContent,
@ -526,7 +531,7 @@ class InferenceRouter(Inference):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
+        messages: Annotated[List[OpenAIMessageParam], Field(..., min_length=1)],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
@ -558,6 +563,16 @@ class InferenceRouter(Inference):
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

+        # Use the OpenAI client's type definitions for a bit of extra input validation
+        # without exposing the OpenAI client itself as part of our API surface
+        if tool_choice:
+            TypeAdapter(OpenAIChatCompletionToolChoiceOptionParam).validate_python(tool_choice)
+            if tools is None:
+                raise ValueError("'tool_choice' is only allowed when 'tools' is also provided")
+        if tools:
+            for tool in tools:
+                TypeAdapter(OpenAIChatCompletionToolParam).validate_python(tool)
+
        params = dict(
            model=model_obj.identifier,
            messages=messages,
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -22,6 +22,7 @@ from fastapi import Body, FastAPI, HTTPException, Request
 from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, StreamingResponse
+from openai import BadRequestError
 from pydantic import BaseModel, ValidationError
 from typing_extensions import Annotated

@ -92,7 +93,7 @@ async def global_exception_handler(request: Request, exc: Exception):

 def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidationError]:
    if isinstance(exc, ValidationError):
-        exc = RequestValidationError(exc.raw_errors)
+        exc = RequestValidationError(exc.errors())

    if isinstance(exc, RequestValidationError):
        return HTTPException(
@ -110,6 +111,8 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio
        )
    elif isinstance(exc, ValueError):
        return HTTPException(status_code=400, detail=f"Invalid value: {str(exc)}")
+    elif isinstance(exc, BadRequestError):
+        return HTTPException(status_code=400, detail=str(exc))
    elif isinstance(exc, PermissionError):
        return HTTPException(status_code=403, detail=f"Permission denied: {str(exc)}")
    elif isinstance(exc, TimeoutError):
@ -162,14 +165,17 @@ async def maybe_await(value):
    return value


-async def sse_generator(event_gen):
+async def sse_generator(event_gen_coroutine):
+    event_gen = None
    try:
-        async for item in await event_gen:
+        event_gen = await event_gen_coroutine
+        async for item in event_gen:
            yield create_sse_event(item)
            await asyncio.sleep(0.01)
    except asyncio.CancelledError:
        logger.info("Generator cancelled")
-        await event_gen.aclose()
+        if event_gen:
+            await event_gen.aclose()
    except Exception as e:
        logger.exception("Error in sse_generator")
        yield create_sse_event(
@ -455,6 +461,7 @@ def main(args: Optional[argparse.Namespace] = None):
                raise ValueError(f"Could not find method {endpoint.name} on {impl}!!")

            impl_method = getattr(impl, endpoint.name)
+            logger.debug(f"{endpoint.method.upper()} {endpoint.route}")

            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._fields")
--- a/llama_stack/distribution/ui/page/playground/rag.py
+++ b/llama_stack/distribution/ui/page/playground/rag.py
@ -24,6 +24,13 @@ def rag_chat_page():
    def should_disable_input():
        return "displayed_messages" in st.session_state and len(st.session_state.displayed_messages) > 0

+    def log_message(message):
+        with st.chat_message(message["role"]):
+            if "tool_output" in message and message["tool_output"]:
+                with st.expander(label="Tool Output", expanded=False, icon="🛠"):
+                    st.write(message["tool_output"])
+            st.markdown(message["content"])
+
    with st.sidebar:
        # File/Directory Upload Section
        st.subheader("Upload Documents", divider=True)
@ -146,8 +153,7 @@ def rag_chat_page():

    # Display chat history
    for message in st.session_state.displayed_messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
+        log_message(message)

    if temperature > 0.0:
        strategy = {
@ -201,7 +207,7 @@ def rag_chat_page():

        # Display assistant response
        with st.chat_message("assistant"):
-            retrieval_message_placeholder = st.empty()
+            retrieval_message_placeholder = st.expander(label="Tool Output", expanded=False, icon="🛠")
            message_placeholder = st.empty()
            full_response = ""
            retrieval_response = ""
@ -209,14 +215,16 @@ def rag_chat_page():
                log.print()
                if log.role == "tool_execution":
                    retrieval_response += log.content.replace("====", "").strip()
-                    retrieval_message_placeholder.info(retrieval_response)
+                    retrieval_message_placeholder.write(retrieval_response)
                else:
                    full_response += log.content
                    message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)

            st.session_state.messages.append({"role": "assistant", "content": full_response})
-            st.session_state.displayed_messages.append({"role": "assistant", "content": full_response})
+            st.session_state.displayed_messages.append(
+                {"role": "assistant", "content": full_response, "tool_output": retrieval_response}
+            )

    def direct_process_prompt(prompt):
        # Add the system prompt in the beginning of the conversation
@ -230,15 +238,14 @@ def rag_chat_page():
        prompt_context = rag_response.content

        with st.chat_message("assistant"):
+            with st.expander(label="Retrieval Output", expanded=False):
+                st.write(prompt_context)
+
            retrieval_message_placeholder = st.empty()
            message_placeholder = st.empty()
            full_response = ""
            retrieval_response = ""

-            # Display the retrieved content
-            retrieval_response += str(prompt_context)
-            retrieval_message_placeholder.info(retrieval_response)
-
            # Construct the extended prompt
            extended_prompt = f"Please answer the following query using the context below.\n\nCONTEXT:\n{prompt_context}\n\nQUERY:\n{prompt}"

--- a/llama_stack/distribution/ui/page/playground/tools.py
+++ b/llama_stack/distribution/ui/page/playground/tools.py
@ -4,14 +4,23 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import enum
+import json
 import uuid

 import streamlit as st
 from llama_stack_client import Agent
+from llama_stack_client.lib.agents.react.agent import ReActAgent
+from llama_stack_client.lib.agents.react.tool_parser import ReActOutput

 from llama_stack.distribution.ui.modules.api import llama_stack_api


+class AgentType(enum.Enum):
+    REGULAR = "Regular"
+    REACT = "ReAct"
+
+
 def tool_chat_page():
    st.title("🛠 Tools")

@ -23,50 +32,117 @@ def tool_chat_page():
    tool_groups_list = [tool_group.identifier for tool_group in tool_groups]
    mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")]
    builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")]
+    selected_vector_dbs = []

    def reset_agent():
        st.session_state.clear()
        st.cache_resource.clear()

    with st.sidebar:
+        st.title("Configuration")
        st.subheader("Model")
-        model = st.selectbox(label="models", options=model_list, on_change=reset_agent)
+        model = st.selectbox(label="Model", options=model_list, on_change=reset_agent, label_visibility="collapsed")
+
+        st.subheader("Available ToolGroups")

-        st.subheader("Builtin Tools")
        toolgroup_selection = st.pills(
-            label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
+            label="Built-in tools",
+            options=builtin_tools_list,
+            selection_mode="multi",
+            on_change=reset_agent,
+            format_func=lambda tool: "".join(tool.split("::")[1:]),
+            help="List of built-in tools from your llama stack server.",
        )

-        st.subheader("MCP Servers")
+        if "builtin::rag" in toolgroup_selection:
+            vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+            if not vector_dbs:
+                st.info("No vector databases available for selection.")
+            vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
+            selected_vector_dbs = st.multiselect(
+                label="Select Document Collections to use in RAG queries",
+                options=vector_dbs,
+                on_change=reset_agent,
+            )
+
        mcp_selection = st.pills(
-            label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
+            label="MCP Servers",
+            options=mcp_tools_list,
+            selection_mode="multi",
+            on_change=reset_agent,
+            format_func=lambda tool: "".join(tool.split("::")[1:]),
+            help="List of MCP servers registered to your llama stack server.",
        )

        toolgroup_selection.extend(mcp_selection)

-        active_tool_list = []
-        for toolgroup_id in toolgroup_selection:
-            active_tool_list.extend(
-                [
-                    f"{''.join(toolgroup_id.split('::')[1:])}:{t.identifier}"
-                    for t in client.tools.list(toolgroup_id=toolgroup_id)
-                ]
-            )
+        grouped_tools = {}
+        total_tools = 0

-        st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}")
-        st.json(active_tool_list)
+        for toolgroup_id in toolgroup_selection:
+            tools = client.tools.list(toolgroup_id=toolgroup_id)
+            grouped_tools[toolgroup_id] = [tool.identifier for tool in tools]
+            total_tools += len(tools)
+
+        st.markdown(f"Active Tools: 🛠 {total_tools}")
+
+        for group_id, tools in grouped_tools.items():
+            with st.expander(f"🔧 Tools from `{group_id}`"):
+                for idx, tool in enumerate(tools, start=1):
+                    st.markdown(f"{idx}. `{tool.split(':')[-1]}`")
+
+        st.subheader("Agent Configurations")
+        st.subheader("Agent Type")
+        agent_type = st.radio(
+            "Select Agent Type",
+            [AgentType.REGULAR, AgentType.REACT],
+            format_func=lambda x: x.value,
+            on_change=reset_agent,
+        )
+
+        max_tokens = st.slider(
+            "Max Tokens",
+            min_value=0,
+            max_value=4096,
+            value=512,
+            step=64,
+            help="The maximum number of tokens to generate",
+            on_change=reset_agent,
+        )
+
+    for i, tool_name in enumerate(toolgroup_selection):
+        if tool_name == "builtin::rag":
+            tool_dict = dict(
+                name="builtin::rag",
+                args={
+                    "vector_db_ids": list(selected_vector_dbs),
+                },
+            )
+            toolgroup_selection[i] = tool_dict

    @st.cache_resource
    def create_agent():
-        return Agent(
-            client,
-            model=model,
-            instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
-            tools=toolgroup_selection,
-            sampling_params={
-                "strategy": {"type": "greedy"},
-            },
-        )
+        if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT:
+            return ReActAgent(
+                client=client,
+                model=model,
+                tools=toolgroup_selection,
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": ReActOutput.model_json_schema(),
+                },
+                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
+            )
+        else:
+            return Agent(
+                client,
+                model=model,
+                instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
+                tools=toolgroup_selection,
+                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
+            )
+
+    st.session_state.agent_type = agent_type

    agent = create_agent()

@ -95,6 +171,158 @@ def tool_chat_page():
        )

        def response_generator(turn_response):
+            if st.session_state.get("agent_type") == AgentType.REACT:
+                return _handle_react_response(turn_response)
+            else:
+                return _handle_regular_response(turn_response)
+
+        def _handle_react_response(turn_response):
+            """Stream a ReAct agent turn, rendering each completed step as it arrives.
+
+            Text deltas from ``step_progress`` events are buffered until the matching
+            ``step_complete`` event. Completed inference steps are parsed and rendered by
+            ``_process_inference_step``; completed tool-execution steps are rendered and
+            recorded by ``_process_tool_execution``. If the stream ends without a final
+            answer but tools produced results, a summary of those results is yielded.
+
+            :param turn_response: iterable of streamed turn chunks, each expected to expose
+                ``event.payload``
+            :yields: markdown fragments to be written to the chat message
+            """
+            current_step_content = ""
+            final_answer = None
+            tool_results = []
+
+            for response in turn_response:
+                if not hasattr(response.event, "payload"):
+                    yield (
+                        "\n\n🚨 :red[_Llama Stack server Error:_]\n"
+                        "The response received is missing an expected `payload` attribute.\n"
+                        "This could indicate a malformed response or an internal issue within the server.\n\n"
+                        f"Error details: {response}"
+                    )
+                    return
+
+                payload = response.event.payload
+
+                if payload.event_type == "step_progress" and hasattr(payload.delta, "text"):
+                    current_step_content += payload.delta.text
+                    continue
+
+                if payload.event_type == "step_complete":
+                    step_details = payload.step_details
+
+                    if step_details.step_type == "inference":
+                        # `yield from` evaluates to the sub-generator's return value. Capture it:
+                        # otherwise final_answer stays None and the fallback summary below is
+                        # emitted even though a final answer was already shown.
+                        final_answer = yield from _process_inference_step(
+                            current_step_content, tool_results, final_answer
+                        )
+                    elif step_details.step_type == "tool_execution":
+                        tool_results = _process_tool_execution(step_details, tool_results)
+
+                    # Every completed step, whatever its type, starts a fresh text buffer.
+                    current_step_content = ""
+
+            # Fallback: the model never produced an answer, so surface what the tools returned.
+            if not final_answer and tool_results:
+                yield from _format_tool_results_summary(tool_results)
+
+        def _process_inference_step(current_step_content, tool_results, final_answer):
+            """Render one completed ReAct inference step and report its answer.
+
+            Parses ``current_step_content`` as JSON and reads the optional ``thought``,
+            ``action`` and ``answer`` keys. The thought and the action are shown in
+            collapsed expanders; a usable answer is yielded as the final-answer text.
+            Parse or processing failures are yielded as inline error text, not raised.
+
+            :param current_step_content: accumulated text of the inference step
+            :param tool_results: not used by this function
+            :param final_answer: answer carried over from earlier steps
+            :returns: (as the generator's return value) this step's answer when it has
+                one, otherwise ``final_answer`` unchanged
+            """
+            try:
+                step = json.loads(current_step_content)
+                thought = step.get("thought")
+                action = step.get("action")
+                answer = step.get("answer")
+
+                # A truthy answer can never be None; the literal string "null" is rejected too.
+                has_answer = bool(answer) and answer != "null"
+                if has_answer:
+                    final_answer = answer
+
+                if thought:
+                    with st.expander("🤔 Thinking...", expanded=False):
+                        st.markdown(f":grey[__{thought}__]")
+
+                if isinstance(action, dict) and action:
+                    requested_tool = action.get("tool_name")
+                    arguments = action.get("tool_params")
+                    with st.expander(f'🛠 Action: Using tool "{requested_tool}"', expanded=False):
+                        st.json(arguments)
+
+                if has_answer:
+                    yield f"\n\n✅ **Final Answer:**\n{answer}"
+
+            except json.JSONDecodeError:
+                yield f"\n\nFailed to parse ReAct step content:\n```json\n{current_step_content}\n```"
+            except Exception as err:
+                yield f"\n\nFailed to process ReAct step: {err}\n```json\n{current_step_content}\n```"
+
+            return final_answer
+
+        def _process_tool_execution(step_details, tool_results):
+            """Render the observations of a completed tool-execution step and record them.
+
+            Each tool response is appended to ``tool_results`` as a ``(tool_name, content)``
+            pair and shown in a collapsed expander: as JSON when the content parses as JSON,
+            otherwise as plain code. Any error while processing is shown in an error
+            expander instead of being raised.
+
+            :param step_details: step details of the completed tool-execution step
+            :param tool_results: list of ``(tool_name, content)`` pairs; extended in place
+            :returns: the same ``tool_results`` list
+            """
+            try:
+                responses = getattr(step_details, "tool_responses", None)
+                if not responses:
+                    with st.expander("⚙️ Observation", expanded=False):
+                        st.markdown(":grey[_Tool execution step completed, but no response data found._]")
+                else:
+                    for tool_response in responses:
+                        name = tool_response.tool_name
+                        raw = tool_response.content
+                        tool_results.append((name, raw))
+                        with st.expander(f'⚙️ Observation (Result from "{name}")', expanded=False):
+                            try:
+                                st.json(json.loads(raw))
+                            except json.JSONDecodeError:
+                                # Not JSON: show the raw content verbatim.
+                                st.code(raw, language=None)
+            except Exception as err:
+                with st.expander("⚙️ Error in Tool Execution", expanded=False):
+                    st.markdown(f":red[_Error processing tool execution: {err!s}_]")
+
+            return tool_results
+
+        def _format_tool_results_summary(tool_results):
+            """Yield a markdown summary of recorded tool results.
+
+            Each result's content is parsed as JSON and routed to a formatter chosen by its
+            shape: web-search ``top_k`` hits, a ``results`` list, a non-empty dict, or a
+            non-empty list. Content that is not valid JSON yields a short notice; type or
+            lookup errors while formatting are printed and that result is skipped.
+
+            :param tool_results: iterable of ``(tool_name, content)`` pairs
+            :yields: markdown fragments
+            """
+            yield "\n\n**Here's what I found:**\n"
+            for tool_name, raw_content in tool_results:
+                try:
+                    data = json.loads(raw_content)
+
+                    # Pick the formatter first; generator bodies only run once iterated below.
+                    if tool_name == "web_search" and "top_k" in data:
+                        fragments = _format_web_search_results(data)
+                    elif "results" in data and isinstance(data["results"], list):
+                        fragments = _format_results_list(data["results"])
+                    elif isinstance(data, dict) and data:
+                        fragments = _format_dict_results(data)
+                    elif isinstance(data, list) and data:
+                        fragments = _format_list_results(data)
+                    else:
+                        fragments = ()
+
+                    yield from fragments
+                except json.JSONDecodeError:
+                    yield f"\n**{tool_name}** was used but returned complex data. Check the observation for details.\n"
+                except (TypeError, AttributeError, KeyError, IndexError) as err:
+                    print(f"Error processing {tool_name} result: {type(err).__name__}: {err}")
+
+        def _format_web_search_results(parsed_content):
+            """Yield one markdown bullet (title, content, source link) for each of the first three ``top_k`` hits."""
+            for position, hit in enumerate(parsed_content["top_k"], 1):
+                if position > 3:
+                    break
+                heading = hit.get("title", "Untitled")
+                link = hit.get("url", "")
+                snippet = hit.get("content", "").strip()
+                yield f"\n- **{heading}**\n  {snippet}\n  [Source]({link})\n"
+
+        def _format_results_list(results):
+            """Yield a markdown bullet for each of the first three entries of a ``results`` list.
+
+            Dict entries are shown as a bold name (``name``, else ``title``, else
+            ``Result <n>``) followed by a description (``description``, else ``content``,
+            else ``summary``, else empty). Any other entry is shown as-is.
+            """
+            for index, entry in enumerate(results, 1):
+                if index > 3:
+                    break
+                if not isinstance(entry, dict):
+                    yield f"\n- {entry}\n"
+                    continue
+                # Fallbacks are resolved innermost-first.
+                title = entry.get("title", f"Result {index}")
+                name = entry.get("name", title)
+                summary = entry.get("summary", "")
+                content = entry.get("content", summary)
+                description = entry.get("description", content)
+                yield f"\n- **{name}**\n  {description}\n"
+
+        def _format_dict_results(parsed_content):
+            """Yield a fenced block listing up to five ``key: value`` lines of a dict.
+
+            Only string values shorter than 100 characters are shown verbatim; every other
+            value is replaced by ``[Complex data]``.
+            """
+            yield "\n```\n"
+            for key, value in list(parsed_content.items())[:5]:
+                is_short_text = isinstance(value, str) and len(value) < 100
+                shown = value if is_short_text else "[Complex data]"
+                yield f"{key}: {shown}\n"
+            yield "```\n"
+
+        def _format_list_results(parsed_content):
+            """Yield a bullet for each of the first three list items that has a simple text form.
+
+            Strings are shown directly; dicts show their ``text`` value, or else their first
+            value when it is a string shorter than 100 characters. Other items are skipped.
+            """
+            yield "\n"
+            for entry in parsed_content[:3]:
+                if isinstance(entry, str):
+                    yield f"- {entry}\n"
+                    continue
+                if not isinstance(entry, dict) or not entry:
+                    continue
+                if "text" in entry:
+                    yield f"- {entry['text']}\n"
+                    continue
+                leading_value = next(iter(entry.values()))
+                if isinstance(leading_value, str) and len(leading_value) < 100:
+                    yield f"- {leading_value}\n"
+
+        def _handle_regular_response(turn_response):
            for response in turn_response:
                if hasattr(response.event, "payload"):
                    print(response.event.payload)
@ -103,14 +331,18 @@ def tool_chat_page():
                            yield response.event.payload.delta.text
                    if response.event.payload.event_type == "step_complete":
                        if response.event.payload.step_details.step_type == "tool_execution":
-                            yield " 🛠 "
+                            if response.event.payload.step_details.tool_calls:
+                                tool_name = str(response.event.payload.step_details.tool_calls[0].tool_name)
+                                yield f'\n\n🛠 :grey[_Using "{tool_name}" tool:_]\n\n'
+                            else:
+                                yield "No tool_calls present in step_details"
                else:
                    yield f"Error occurred in the Llama Stack Cluster: {response}"

        with st.chat_message("assistant"):
-            response = st.write_stream(response_generator(turn_response))
+            response_content = st.write_stream(response_generator(turn_response))

-        st.session_state.messages.append({"role": "assistant", "content": response})
+        st.session_state.messages.append({"role": "assistant", "content": response_content})


 tool_chat_page()
--- a/llama_stack/models/llama/llama4/chat_format.py
+++ b/llama_stack/models/llama/llama4/chat_format.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import io
+import json
 import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@ -299,8 +300,10 @@ class ChatFormat:
                    call_id=call_id,
                    tool_name=tool_name,
                    arguments=tool_arguments,
+                    arguments_json=json.dumps(tool_arguments),
                )
            )
+            content = ""

        return RawMessage(
            role="assistant",
--- a/llama_stack/models/llama/llama4/prompt_format.md
+++ b/llama_stack/models/llama/llama4/prompt_format.md
@ -64,7 +64,7 @@ This example passes an image that is smaller than the tile size, to show the til

 ##### Model Response Format
 ```
-The image depicts a dog standing on a skateboard, with its front paws positioned on the board and its back paws hanging off the back. The dog has a distinctive coat pattern, featuring a white face, brown and black fur, and white paws, and is standing on a skateboard with red wheels, set against a blurred background of a street or alleyway with a teal door and beige wall.<|eot|>
+The image depicts a dog standing on a skateboard, positioned centrally and facing the camera directly. The dog has a distinctive coat pattern featuring white, black, and brown fur, with floppy ears and a black nose, and is standing on a skateboard with red wheels.<|eot|>
 ```


@ -91,7 +91,7 @@ Here is an example of how to pass an image to the model

 ##### Model Response Format
 ```
-This image shows a dog standing on a skateboard, with its front paws positioned near the front of the board and its back paws near the back. The dog has a white, black, and orange coat, and is standing on a gray skateboard with red wheels, in front of a blurred background that appears to be a street or alleyway.<|eot|>
+The image depicts a dog standing on a skateboard, with the dog positioned centrally and facing forward. The dog has a distinctive coat featuring a mix of white, brown, and black fur, and is wearing a collar as it stands on the skateboard, which has red wheels.<|eot|>
 ```


@ -117,7 +117,7 @@ Here is an example of how to pass an image to the model

 ##### Model Response Format
 ```
-The first image shows a dog standing on a skateboard, while the second image shows a plate of spaghetti with tomato sauce, parmesan cheese, and parsley. The two images are unrelated, with the first image featuring a dog and the second image featuring a food dish, and they do not share any common elements or themes.<|eot|>
+The first image features a dog standing on a skateboard, while the second image showcases a plate of spaghetti with tomato sauce and cheese. The two images appear to be unrelated, with one depicting a playful scene of a dog on a skateboard and the other presenting a classic Italian dish.<|eom|>
 ```


@ -135,13 +135,44 @@ We are continuing the format for zero shot function calling used in previous ver
 ```
 <|begin_of_text|><|header_start|>system<|header_end|>

-You are an expert in composing functions. You are given a question and a set of possible functions.
-Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
-also point it out. You should only return the function call in tools call sections.
+You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:
+
+1. FUNCTION CALLS:
+- ONLY use functions that are EXPLICITLY listed in the function list below
+- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+- If a function is not in the list, respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)
+- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]
+Examples:
+CORRECT: [get_weather(location="Vancouver"), calculate_route(start="Boston", end="New York")] <- Only if get_weather and calculate_route are in function list
+INCORRECT: get_weather(location="New York")
+INCORRECT: Let me check the weather: [get_weather(location="New York")]
+INCORRECT: [get_events(location="Singapore")] <- If function not in list
+
+2. RESPONSE RULES:
+- For pure function requests matching a listed function: ONLY output the function call(s)
+- For knowledge questions: ONLY output text
+- For missing parameters: ONLY request the specific missing parameters
+- For unavailable services (not in function list): output ONLY with internal knowledge or "I don't have access to [Unavailable service] information". Do NOT execute a function call.
+- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations
+- NEVER combine text and function calls in the same response
+- NEVER suggest alternative functions when the requested service is unavailable
+- NEVER create or invent new functions not listed below
+
+3. STRICT BOUNDARIES:
+- ONLY use functions from the list below - no exceptions
+- NEVER use a function as an alternative to unavailable information
+- NEVER call functions not present in the function list
+- NEVER add explanatory text to function calls
+- NEVER respond with empty brackets
+- Use proper Python/JSON syntax for function calls
+- Check the function list carefully before responding
+
+4. TOOL RESPONSE HANDLING:
+- When receiving tool responses: provide concise, natural language responses
+- Don't repeat tool response verbatim
+- Don't add supplementary information

-If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-You SHOULD NOT include any other text in the response.

 Here is a list of functions in JSON format that you can invoke.

@ -151,9 +182,7 @@ Here is a list of functions in JSON format that you can invoke.
        "description": "Get weather info for places",
        "parameters": {
            "type": "dict",
-            "required": [
-                "city"
-            ],
+            "required": ["city"],
            "properties": {
                "city": {
                    "type": "string",
@ -167,7 +196,10 @@ Here is a list of functions in JSON format that you can invoke.
            }
        }
    }
-<|eot|><|header_start|>user<|header_end|>
+]
+
+You can answer general questions or invoke tools when necessary.
+In addition to tool calls, you should also augment your responses by using the tool outputs.<|eot|><|header_start|>user<|header_end|>

 What is the weather in SF and Seattle?<|eot|><|header_start|>assistant<|header_end|>

@ -176,7 +208,7 @@ What is the weather in SF and Seattle?<|eot|><|header_start|>assistant<|header_e

 ##### Model Response Format
 ```
-[get_weather(city='SF'), get_weather(city='Seattle')]<|eot|>
+[get_weather(city="San Francisco"), get_weather(city="Seattle")]<|eot|>
 ```


@ -273,5 +305,5 @@ Use tools to get latest trending songs<|eot|><|header_start|>assistant<|header_e

 ##### Model Response Format
 ```
-<function=trending_songs>{"n": "10"}</function><|eot|>
+<function=trending_songs>{"n": 10}</function><|eot|>
 ```
--- a/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
+++ b/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
@ -0,0 +1,144 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# top-level folder for each specific model found within the models/ directory at
+# the top-level of this source tree.
+
+import textwrap
+from typing import List, Optional
+
+from llama_stack.apis.inference import ToolDefinition, ToolParamDefinition
+from llama_stack.models.llama.llama3.prompt_templates.base import (
+    PromptTemplate,
+    PromptTemplateGeneratorBase,
+)
+
+
+class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
+    """Builds the system prompt for custom tools invoked in Python-list call format.
+
+    The default prompt tells the model to emit calls as
+    ``[func_name1(param1=value1, ...), func_name2(...)]`` and is followed by a JSON-style
+    listing of the available tools.
+    """
+
+    # Default system prompt; `{{ function_description }}` is filled with the rendered tool list.
+    DEFAULT_PROMPT = textwrap.dedent(
+        """
+        You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:
+
+        1. FUNCTION CALLS:
+        - ONLY use functions that are EXPLICITLY listed in the function list below
+        - If NO functions are listed (empty function list []), respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+        - If a function is not in the list, respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+        - If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)
+        - Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]
+        Examples:
+        CORRECT: [get_weather(location="Vancouver"), calculate_route(start="Boston", end="New York")] <- Only if get_weather and calculate_route are in function list
+        INCORRECT: get_weather(location="New York")
+        INCORRECT: Let me check the weather: [get_weather(location="New York")]
+        INCORRECT: [get_events(location="Singapore")] <- If function not in list
+
+        2. RESPONSE RULES:
+        - For pure function requests matching a listed function: ONLY output the function call(s)
+        - For knowledge questions: ONLY output text
+        - For missing parameters: ONLY request the specific missing parameters
+        - For unavailable services (not in function list): output ONLY with internal knowledge or "I don't have access to [Unavailable service] information". Do NOT execute a function call.
+        - If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations
+        - NEVER combine text and function calls in the same response
+        - NEVER suggest alternative functions when the requested service is unavailable
+        - NEVER create or invent new functions not listed below
+
+        3. STRICT BOUNDARIES:
+        - ONLY use functions from the list below - no exceptions
+        - NEVER use a function as an alternative to unavailable information
+        - NEVER call functions not present in the function list
+        - NEVER add explanatory text to function calls
+        - NEVER respond with empty brackets
+        - Use proper Python/JSON syntax for function calls
+        - Check the function list carefully before responding
+
+        4. TOOL RESPONSE HANDLING:
+        - When receiving tool responses: provide concise, natural language responses
+        - Don't repeat tool response verbatim
+        - Don't add supplementary information
+
+
+        {{ function_description }}
+        """.strip("\n")
+    )
+
+    def gen(self, custom_tools: List[ToolDefinition], system_prompt: Optional[str] = None) -> PromptTemplate:
+        """Build the system-prompt template for ``custom_tools``.
+
+        :param custom_tools: tool definitions to describe in the prompt
+        :param system_prompt: prompt text to use instead of ``DEFAULT_PROMPT``; a falsy
+            value (``None`` or ``""``) selects the default
+        :returns: a ``PromptTemplate`` whose ``function_description`` value is the rendered
+            tool listing
+        """
+        # NOTE(review): a custom system_prompt presumably needs its own
+        # `{{ function_description }}` placeholder for the tool list to appear - confirm.
+        system_prompt = system_prompt or self.DEFAULT_PROMPT
+        return PromptTemplate(
+            system_prompt,
+            {"function_description": self._gen_function_description(custom_tools)},
+        )
+
+    def _gen_function_description(self, custom_tools: List[ToolDefinition]) -> str:
+        """Render the JSON-style listing of ``custom_tools`` that is embedded in the prompt.
+
+        Each tool is written with its name, description, the names of its required
+        parameters, and a ``properties`` entry per parameter (type, description and,
+        only when the default is truthy, the default).
+
+        :param custom_tools: tool definitions to list
+        :returns: the result of rendering the template
+            (NOTE(review): presumably a ``str`` - confirm against ``PromptTemplate.render``)
+        """
+        template_str = textwrap.dedent(
+            """
+            Here is a list of functions in JSON format that you can invoke.
+
+            [
+                {% for t in tools -%}
+                {# manually setting up JSON because jinja sorts keys in unexpected ways -#}
+                {%- set tname = t.tool_name -%}
+                {%- set tdesc = t.description -%}
+                {%- set tparams = t.parameters -%}
+                {%- set required_params = [] -%}
+                {%- for name, param in tparams.items() if param.required == true -%}
+                    {%- set _ = required_params.append(name) -%}
+                {%- endfor -%}
+                {
+                    "name": "{{tname}}",
+                    "description": "{{tdesc}}",
+                    "parameters": {
+                        "type": "dict",
+                        "required": {{ required_params | tojson }},
+                        "properties": {
+                            {%- for name, param in tparams.items() %}
+                            "{{name}}": {
+                                "type": "{{param.param_type}}",
+                                "description": "{{param.description}}"{% if param.default %},
+                                "default": "{{param.default}}"{% endif %}
+                            }{% if not loop.last %},{% endif %}
+                            {%- endfor %}
+                        }
+                    }
+                }{% if not loop.last %},
+                {% endif -%}
+                {%- endfor %}
+            ]
+
+            You can answer general questions or invoke tools when necessary.
+            In addition to tool calls, you should also augment your responses by using the tool outputs.
+
+            """
+        )
+        # Tools are dumped to plain dicts so the template can use attribute/key access on them.
+        return PromptTemplate(
+            template_str.strip("\n"),
+            {"tools": [t.model_dump() for t in custom_tools]},
+        ).render()
+
+    def data_examples(self) -> List[List[ToolDefinition]]:
+        """Return example inputs for ``gen``.
+
+        :returns: a single example tool list holding one ``get_weather`` tool with a
+            required ``city`` parameter and an optional ``metric`` parameter that
+            defaults to ``"celsius"``
+        """
+        return [
+            [
+                ToolDefinition(
+                    tool_name="get_weather",
+                    description="Get weather info for places",
+                    parameters={
+                        "city": ToolParamDefinition(
+                            param_type="string",
+                            description="The name of the city to get the weather for",
+                            required=True,
+                        ),
+                        "metric": ToolParamDefinition(
+                            param_type="string",
+                            description="The metric for weather. Options are: celsius, fahrenheit",
+                            required=False,
+                            default="celsius",
+                        ),
+                    },
+                ),
+            ]
+        ]
--- a/llama_stack/models/llama/llama4/prompts.py
+++ b/llama_stack/models/llama/llama4/prompts.py
@ -9,6 +9,10 @@ from io import BytesIO
 from pathlib import Path
 from typing import List

+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator,
+)
+
 from ..datatypes import RawMediaItem, RawMessage, RawTextItem
 from ..prompt_format import (
    Llama4UseCase,
@ -177,39 +181,9 @@ def usecases(base_model: bool = False) -> List[UseCase | str]:
                    [
                        RawMessage(
                            role="system",
-                            content="""You are an expert in composing functions. You are given a question and a set of possible functions.
-Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
-also point it out. You should only return the function call in tools call sections.
-
-If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-You SHOULD NOT include any other text in the response.
-
-Here is a list of functions in JSON format that you can invoke.
-
-[
-    {
-        "name": "get_weather",
-        "description": "Get weather info for places",
-        "parameters": {
-            "type": "dict",
-            "required": [
-                "city"
-            ],
-            "properties": {
-                "city": {
-                    "type": "string",
-                    "description": "The name of the city to get the weather for"
-                },
-                "metric": {
-                    "type": "string",
-                    "description": "The metric for weather. Options are: celsius, fahrenheit",
-                    "default": "celsius"
-                }
-            }
-        }
-    }
-""",
+                            content=PythonListCustomToolGenerator()
+                            .gen(PythonListCustomToolGenerator().data_examples()[0])
+                            .render(),
                        ),
                        RawMessage(
                            role="user",
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
            span.set_attribute("request", request.model_dump_json())
            turn_id = str(uuid.uuid4())
            span.set_attribute("turn_id", turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

        await self._initialize_tools(request.toolgroups)
        async for chunk in self._run_turn(request, turn_id):
@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
            span.set_attribute("session_id", request.session_id)
            span.set_attribute("request", request.model_dump_json())
            span.set_attribute("turn_id", request.turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

        await self._initialize_tools()
        async for chunk in self._run_turn(request):
@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
            stop_reason = None

            async with tracing.span("inference") as span:
+                if self.agent_config.name:
+                    span.set_attribute("agent_name", self.agent_config.name)
                async for chunk in await self.inference_api.chat_completion(
                    self.agent_config.model,
                    input_messages,
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@ -253,7 +253,8 @@ class MetaReferenceInferenceImpl(
        def impl():
            stop_reason = None

-            for token_result in self.generator.completion(request):
+            for token_results in self.generator.completion([request]):
+                token_result = token_results[0]
                if token_result.token == tokenizer.eot_id:
                    stop_reason = StopReason.end_of_turn
                    text = ""
@ -515,7 +516,8 @@ class MetaReferenceInferenceImpl(
            stop_reason = None
            ipython = False

-            for token_result in self.generator.chat_completion(request):
+            for token_results in self.generator.chat_completion([request]):
+                token_result = token_results[0]
                if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
                    cprint(token_result.text, "cyan", end="")
                if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
--- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
@ -69,7 +69,10 @@ class CancelSentinel(BaseModel):

 class TaskRequest(BaseModel):
+    # Message carrying one unit of work; the worker loop passes `task` to the model callable.
+    # `type` is fixed to ProcessingMessageName.task_request.
     type: Literal[ProcessingMessageName.task_request] = ProcessingMessageName.task_request
-    task: Tuple[str, List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent]]
+    # Pair of a string and a batch of completion or chat-completion requests.
+    # NOTE(review): the string presumably names the kind of task - confirm against the sender.
+    task: Tuple[
+        str,
+        List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent],
+    ]


 class TaskResponse(BaseModel):
@ -231,10 +234,10 @@ def worker_process_entrypoint(
    while True:
        try:
            task = req_gen.send(result)
-            if isinstance(task, str) and task == EndSentinel():
+            if isinstance(task, EndSentinel):
                break

-            assert isinstance(task, TaskRequest)
+            assert isinstance(task, TaskRequest), task
            result = model(task.task)
        except StopIteration:
            break
@ -331,7 +334,10 @@ class ModelParallelProcessGroup:

    def run_inference(
        self,
-        req: Tuple[str, List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent]],
+        req: Tuple[
+            str,
+            List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent],
+        ],
    ) -> Generator:
        assert not self.running, "inference already running"

--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@ -33,6 +33,7 @@ from llama_stack.apis.tools import (
 )
 from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
 from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.vector_store import (
    content_from_doc,
    make_overlapped_chunks,
@ -153,6 +154,11 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                )
            )
        picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
+        picked.append(
+            TextContentItem(
+                text=f'The above results were retrieved to help answer the user\'s query: "{interleaved_content_as_str(content)}". Use them as supporting information only in answering this query.\n',
+            )
+        )

        return RAGQueryResult(
            content=picked,
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -288,4 +288,14 @@ def available_providers() -> List[ProviderSpec]:
                provider_data_validator="llama_stack.providers.remote.inference.passthrough.PassthroughProviderDataValidator",
            ),
        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="watsonx",
+                pip_packages=["ibm_watson_machine_learning"],
+                module="llama_stack.providers.remote.inference.watsonx",
+                config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",
+            ),
+        ),
    ]
--- a/llama_stack/providers/remote/eval/nvidia/README.md
+++ b/llama_stack/providers/remote/eval/nvidia/README.md
@ -77,7 +77,7 @@ POST /eval/benchmarks/{benchmark_id}/jobs
  "benchmark_config": {
    "eval_candidate": {
      "type": "model",
-      "model": "meta/llama-3.1-8b-instruct",
+      "model": "meta-llama/Llama3.1-8B-Instruct",
      "sampling_params": {
        "max_tokens": 100,
        "temperature": 0.7
@ -91,7 +91,7 @@ POST /eval/benchmarks/{benchmark_id}/jobs
 Response example:
 ```json
 {
-    "job_id": "1234",
+    "job_id": "eval-1234",
    "status": "in_progress"
 }
 ```
@ -101,6 +101,14 @@ Response example:
 GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
 ```

+Response example:
+```json
+{
+  "job_id": "eval-1234",
+  "status": "in_progress"
+}
+```
+
 ### Example for cancelling a job
 ```
 POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
--- a/llama_stack/providers/remote/eval/nvidia/config.py
+++ b/llama_stack/providers/remote/eval/nvidia/config.py
@ -14,10 +14,10 @@ class NVIDIAEvalConfig(BaseModel):
     Configuration for the NVIDIA NeMo Evaluator microservice endpoint.

    Attributes:
-        evaluator_service_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
+        evaluator_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
    """

-    evaluator_service_url: str = Field(
+    evaluator_url: str = Field(
        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the evaluator service",
    )
@ -25,5 +25,5 @@ class NVIDIAEvalConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
-            "evaluator_service_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
+            "evaluator_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
        }
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@ -53,13 +53,13 @@ class NVIDIAEvalImpl(

    async def _evaluator_get(self, path):
        """Helper for making GET requests to the evaluator service."""
-        response = requests.get(url=f"{self.config.evaluator_service_url}{path}")
+        response = requests.get(url=f"{self.config.evaluator_url}{path}")
        response.raise_for_status()
        return response.json()

    async def _evaluator_post(self, path, data):
        """Helper for making POST requests to the evaluator service."""
-        response = requests.post(url=f"{self.config.evaluator_service_url}{path}", json=data)
+        response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
        response.raise_for_status()
        return response.json()

--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -362,6 +362,39 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        user: Optional[str] = None,
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
        model_obj = await self.model_store.get_model(model)
+
+        # Divert Llama Models through Llama Stack inference APIs because
+        # Fireworks chat completions OpenAI-compatible API does not support
+        # tool calls properly.
+        llama_model = self.get_llama_model(model_obj.provider_resource_id)
+        if llama_model:
+            return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(
+                self,
+                model=model,
+                messages=messages,
+                frequency_penalty=frequency_penalty,
+                function_call=function_call,
+                functions=functions,
+                logit_bias=logit_bias,
+                logprobs=logprobs,
+                max_completion_tokens=max_completion_tokens,
+                max_tokens=max_tokens,
+                n=n,
+                parallel_tool_calls=parallel_tool_calls,
+                presence_penalty=presence_penalty,
+                response_format=response_format,
+                seed=seed,
+                stop=stop,
+                stream=stream,
+                stream_options=stream_options,
+                temperature=temperature,
+                tool_choice=tool_choice,
+                tools=tools,
+                top_logprobs=top_logprobs,
+                top_p=top_p,
+                user=user,
+            )
+
        params = await prepare_openai_completion_params(
            messages=messages,
            frequency_penalty=frequency_penalty,
@ -387,11 +420,4 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
            user=user,
        )

-        # Divert Llama Models through Llama Stack inference APIs because
-        # Fireworks chat completions OpenAI-compatible API does not support
-        # tool calls properly.
-        llama_model = self.get_llama_model(model_obj.provider_resource_id)
-        if llama_model:
-            return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(self, model=model, **params)
-
        return await self._get_openai_client().chat.completions.create(model=model_obj.provider_resource_id, **params)
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@ -0,0 +1,85 @@
+# NVIDIA Inference Provider for LlamaStack
+
+This provider enables running inference using NVIDIA NIM.
+
+## Features
+- Endpoints for completions, chat completions, and embeddings for registered models
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to an NVIDIA NIM deployment
+- A NIM deployed for the model you want to use for inference
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = (
+    ""  # Required if using hosted NIM endpoint. If self-hosted, not required.
+)
+os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM URL
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+### Create Completion
+
+```python
+response = client.completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    content="Complete the sentence using one word: Roses are red, violets are :",
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.content}")
+```
+
+### Create Chat Completion
+
+```python
+response = client.chat_completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    messages=[
+        {
+            "role": "system",
+            "content": "You must respond to each message with only one word",
+        },
+        {
+            "role": "user",
+            "content": "Complete the sentence using one word: Roses are red, violets are:",
+        },
+    ],
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.completion_message.content}")
+```
+
+### Create Embeddings
+```python
+response = client.embeddings(
+    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
+)
+print(f"Embeddings: {response.embeddings}")
+```
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@ -48,6 +48,10 @@ MODEL_ENTRIES = [
        "meta/llama-3.2-90b-vision-instruct",
        CoreModelId.llama3_2_90b_vision_instruct.value,
    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
    # NeMo Retriever Text Embedding models -
    #
    # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -129,6 +129,14 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
            base_url = special_model_urls[provider_model_id]
        return _get_client_for_base_url(base_url)

+    async def _get_provider_model_id(self, model_id: str) -> str:
+        """Resolve a model identifier to the provider's model id via the model store.
+
+        :param model_id: Identifier to look up in the model store
+        :returns: The ``provider_model_id`` of the stored model
+        :raises RuntimeError: If the adapter has no model store
+        :raises ValueError: If the store returns ``None`` for ``model_id``
+        """
+        store = self.model_store
+        if not store:
+            raise RuntimeError("Model store is not set")
+
+        registered = await store.get_model(model_id)
+        if registered is None:
+            raise ValueError(f"Model {model_id} is unknown")
+        return registered.provider_model_id
+
    async def completion(
        self,
        model_id: str,
@ -147,7 +155,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        # removing this health check as NeMo customizer endpoint health check is returning 404
        # await check_health(self._config)  # this raises errors

-        provider_model_id = self.get_provider_model_id(model_id)
+        provider_model_id = await self._get_provider_model_id(model_id)
        request = convert_completion_request(
            request=CompletionRequest(
                model=provider_model_id,
@ -191,7 +199,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        #
        flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
        input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
-        model = self.get_provider_model_id(model_id)
+        provider_model_id = await self._get_provider_model_id(model_id)

        extra_body = {}

@ -214,8 +222,8 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
            extra_body["input_type"] = task_type_options[task_type]

        try:
-            response = await self._get_client(model).embeddings.create(
-                model=model,
+            response = await self._get_client(provider_model_id).embeddings.create(
+                model=provider_model_id,
                input=input,
                extra_body=extra_body,
            )
@ -249,11 +257,10 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):

        # await check_health(self._config)  # this raises errors

-        provider_model_id = self.get_provider_model_id(model_id)
-        print(f"provider_model_id: {provider_model_id}")
+        provider_model_id = await self._get_provider_model_id(model_id)
        request = await convert_chat_completion_request(
            request=ChatCompletionRequest(
-                model=self.get_provider_model_id(model_id),
+                model=provider_model_id,
                messages=messages,
                sampling_params=sampling_params,
                response_format=response_format,
@ -298,7 +305,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
-        provider_model_id = self.get_provider_model_id(model)
+        provider_model_id = await self._get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
@ -351,7 +358,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
-        provider_model_id = self.get_provider_model_id(model)
+        provider_model_id = await self._get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -76,8 +76,11 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi

    async def shutdown(self) -> None:
        if self._client:
-            await self._client.close()
+            # Together client has no close method, so just set to None
            self._client = None
+        if self._openai_client:
+            await self._openai_client.close()
+            self._openai_client = None

    async def completion(
        self,
@ -359,7 +362,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
            top_p=top_p,
            user=user,
        )
-        if params.get("stream", True):
+        if params.get("stream", False):
            return self._stream_openai_chat_completion(params)
        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore

--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -231,12 +231,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        self.client = None

    async def initialize(self) -> None:
-        log.info(f"Initializing VLLM client with base_url={self.config.url}")
-        self.client = AsyncOpenAI(
-            base_url=self.config.url,
-            api_key=self.config.api_token,
-            http_client=None if self.config.tls_verify else httpx.AsyncClient(verify=False),
-        )
+        pass

    async def shutdown(self) -> None:
        pass
@ -249,6 +244,20 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            raise ValueError("Model store not set")
        return await self.model_store.get_model(model_id)

+    def _lazy_initialize_client(self):
+        """Create ``self.client`` on first use; do nothing if it already exists."""
+        if self.client is None:
+            log.info(f"Initializing vLLM client with base_url={self.config.url}")
+            self.client = self._create_client()
+
+    def _create_client(self):
+        """Build a new ``AsyncOpenAI`` client from the adapter configuration.
+
+        :returns: A client bound to ``config.url`` and ``config.api_token``
+        """
+        # A custom HTTP client is only supplied when certificate verification
+        # has been switched off in the config; otherwise the default is used.
+        if self.config.tls_verify:
+            http_client = None
+        else:
+            http_client = httpx.AsyncClient(verify=False)
+
+        return AsyncOpenAI(
+            base_url=self.config.url,
+            api_key=self.config.api_token,
+            http_client=http_client,
+        )
+
    async def completion(
        self,
        model_id: str,
@ -258,6 +267,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
+        self._lazy_initialize_client()
        if sampling_params is None:
            sampling_params = SamplingParams()
        model = await self._get_model(model_id)
@ -287,6 +297,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        logprobs: Optional[LogProbConfig] = None,
        tool_config: Optional[ToolConfig] = None,
    ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
+        self._lazy_initialize_client()
        if sampling_params is None:
            sampling_params = SamplingParams()
        model = await self._get_model(model_id)
@ -357,9 +368,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            yield chunk

    async def register_model(self, model: Model) -> Model:
-        assert self.client is not None
+        # register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet.
+        # self.client should only be created after the initialization is complete to avoid asyncio cross-context errors.
+        # Changing this may lead to unpredictable behavior.
+        client = self._create_client() if self.client is None else self.client
        model = await self.register_helper.register_model(model)
-        res = await self.client.models.list()
+        res = await client.models.list()
        available_models = [m.id async for m in res]
        if model.provider_resource_id not in available_models:
            raise ValueError(
@ -374,7 +388,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            options["max_tokens"] = self.config.max_tokens

        input_dict: dict[str, Any] = {}
-        if isinstance(request, ChatCompletionRequest) and request.tools is not None:
+        # Only include the 'tools' param if there is any. It can break things if an empty list is sent to the vLLM.
+        if isinstance(request, ChatCompletionRequest) and request.tools:
            input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}

        if isinstance(request, ChatCompletionRequest):
@ -409,6 +424,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        output_dimension: Optional[int] = None,
        task_type: Optional[EmbeddingTaskType] = None,
    ) -> EmbeddingsResponse:
+        self._lazy_initialize_client()
        assert self.client is not None
        model = await self._get_model(model_id)

@ -448,6 +464,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
+        self._lazy_initialize_client()
        model_obj = await self._get_model(model)

        extra_body: Dict[str, Any] = {}
@ -504,6 +521,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        self._lazy_initialize_client()
        model_obj = await self._get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
--- a/llama_stack/providers/remote/inference/watsonx/__init__.py
+++ b/llama_stack/providers/remote/inference/watsonx/__init__.py
@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import WatsonXConfig
+
+
+async def get_adapter_impl(config: WatsonXConfig, _deps) -> Inference:
+    """Create the watsonx inference adapter for ``config``.
+
+    :param config: Provider configuration; must be a ``WatsonXConfig``
+    :param _deps: Unused
+    :returns: A ``WatsonXInferenceAdapter`` built from ``config``
+    :raises RuntimeError: If ``config`` is not a ``WatsonXConfig``
+    """
+    # Deferred import so `llama stack build` does not fail due to missing dependencies.
+    from .watsonx import WatsonXInferenceAdapter
+
+    if isinstance(config, WatsonXConfig):
+        return WatsonXInferenceAdapter(config)
+    raise RuntimeError(f"Unexpected config type: {type(config)}")
+
+
+__all__ = ["get_adapter_impl", "WatsonXConfig"]
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field, SecretStr
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class WatsonXProviderDataValidator(BaseModel):
+    """Schema with three required string fields: ``url``, ``api_key`` and ``project_id``.
+
+    NOTE(review): presumably validates provider data supplied for watsonx
+    outside of the static config -- confirm against the provider registry.
+    """
+
+    url: str
+    api_key: str
+    project_id: str
+
+
+def _secret_from_env(name: str) -> Optional[SecretStr]:
+    """Return environment variable ``name`` wrapped in ``SecretStr``, or ``None`` if it is unset."""
+    value = os.getenv(name)
+    return None if value is None else SecretStr(value)
+
+
+@json_schema_type
+class WatsonXConfig(BaseModel):
+    """Configuration for the watsonx.ai remote inference provider.
+
+    :param url: Base URL of the watsonx.ai service; defaults to ``WATSONX_BASE_URL`` or the us-south endpoint
+    :param api_key: Optional API key; defaults to ``WATSONX_API_KEY``
+    :param project_id: Optional project id; defaults to ``WATSONX_PROJECT_ID``
+    :param timeout: Timeout for the HTTP requests
+    """
+
+    url: str = Field(
+        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
+        description="A base url for accessing the watsonx.ai",
+    )
+    # Defaults are not validated by pydantic, so the factory has to produce a
+    # SecretStr itself; a bare str would break callers that use
+    # `api_key.get_secret_value()`.
+    api_key: Optional[SecretStr] = Field(
+        default_factory=lambda: _secret_from_env("WATSONX_API_KEY"),
+        description="The watsonx API key, only needed if using the hosted service",
+    )
+    project_id: Optional[str] = Field(
+        default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
+        description="The Project ID key, only needed if using the hosted service",
+    )
+    timeout: int = Field(
+        default=60,
+        description="Timeout for the HTTP requests",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+        """Return the sample run configuration, with values taken from environment placeholders."""
+        return {
+            "url": "${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}",
+            "api_key": "${env.WATSONX_API_KEY:}",
+            "project_id": "${env.WATSONX_PROJECT_ID:}",
+        }
--- a/llama_stack/providers/remote/inference/watsonx/models.py
+++ b/llama_stack/providers/remote/inference/watsonx/models.py
@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.models.llama.sku_types import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import build_hf_repo_model_entry
+
+# Each pair is (watsonx model id, Llama Stack core model id); one registry
+# entry is built per pair, in the order listed.
+MODEL_ENTRIES = [
+    build_hf_repo_model_entry(provider_model_id, core_model.value)
+    for provider_model_id, core_model in (
+        ("meta-llama/llama-3-3-70b-instruct", CoreModelId.llama3_3_70b_instruct),
+        # NOTE(review): the watsonx id names the chat variant while the core id
+        # is `llama2_13b` -- confirm this pairing is intended.
+        ("meta-llama/llama-2-13b-chat", CoreModelId.llama2_13b),
+        ("meta-llama/llama-3-1-70b-instruct", CoreModelId.llama3_1_70b_instruct),
+        ("meta-llama/llama-3-1-8b-instruct", CoreModelId.llama3_1_8b_instruct),
+        ("meta-llama/llama-3-2-11b-vision-instruct", CoreModelId.llama3_2_11b_vision_instruct),
+        ("meta-llama/llama-3-2-1b-instruct", CoreModelId.llama3_2_1b_instruct),
+        ("meta-llama/llama-3-2-3b-instruct", CoreModelId.llama3_2_3b_instruct),
+        ("meta-llama/llama-3-2-90b-vision-instruct", CoreModelId.llama3_2_90b_vision_instruct),
+        ("meta-llama/llama-guard-3-11b-vision", CoreModelId.llama_guard_3_11b_vision),
+    )
+]
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -0,0 +1,378 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+
+from ibm_watson_machine_learning.foundation_models import Model
+from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
+from openai import AsyncOpenAI
+
+from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
+from llama_stack.apis.inference import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    CompletionRequest,
+    EmbeddingsResponse,
+    EmbeddingTaskType,
+    Inference,
+    LogProbConfig,
+    Message,
+    ResponseFormat,
+    SamplingParams,
+    TextTruncation,
+    ToolChoice,
+    ToolConfig,
+    ToolDefinition,
+    ToolPromptFormat,
+)
+from llama_stack.apis.inference.inference import (
+    GreedySamplingStrategy,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
+)
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAICompatCompletionChoice,
+    OpenAICompatCompletionResponse,
+    prepare_openai_completion_params,
+    process_chat_completion_response,
+    process_chat_completion_stream_response,
+    process_completion_response,
+    process_completion_stream_response,
+)
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    chat_completion_request_to_prompt,
+    completion_request_to_prompt,
+    request_has_media,
+)
+
+from . import WatsonXConfig
+from .models import MODEL_ENTRIES
+
+
+class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
+    def __init__(self, config: WatsonXConfig) -> None:
+        """Register the watsonx model entries and store the provider configuration.
+
+        :param config: watsonx connection settings (url, api key, project id)
+        """
+        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
+
+        print(f"Initializing watsonx InferenceAdapter({config.url})...")
+
+        self._config = config
+
+        self._project_id = self._config.project_id
+
+        # Created on demand by _get_openai_client(), which tests this attribute
+        # before building the client, so it must exist from construction time.
+        self._openai_client: Optional[AsyncOpenAI] = None
+
+    async def initialize(self) -> None:
+        """No-op: this adapter performs no work at initialization time."""
+        pass
+
+    async def shutdown(self) -> None:
+        """No-op: this adapter performs no work at shutdown."""
+        pass
+
+    async def completion(
+        self,
+        model_id: str,
+        content: InterleavedContent,
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        """Run a completion request against watsonx.
+
+        :param model_id: Identifier looked up in the model store
+        :param content: Content to complete
+        :param sampling_params: Sampling settings; a default ``SamplingParams()`` is used when ``None``
+        :param response_format: Forwarded unchanged on the request
+        :param stream: When truthy, return the streaming generator instead of a single response
+        :param logprobs: Forwarded unchanged on the request
+        :returns: The result of ``_stream_completion`` or the awaited ``_nonstream_completion``
+        """
+        effective_sampling = SamplingParams() if sampling_params is None else sampling_params
+        resolved = await self.model_store.get_model(model_id)
+        request = CompletionRequest(
+            model=resolved.provider_resource_id,
+            content=content,
+            sampling_params=effective_sampling,
+            response_format=response_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+        if not stream:
+            return await self._nonstream_completion(request)
+        return self._stream_completion(request)
+
+    def _get_client(self, model_id) -> Model:
+        """Build a watsonx ``Model`` handle for ``model_id`` from the stored config.
+
+        :param model_id: Model identifier passed through to ``Model``
+        :returns: A new ``Model`` carrying the configured url, API key and project id
+        """
+        secret = self._config.api_key
+        credentials = {
+            "url": self._config.url,
+            # A missing/empty key is passed on as None.
+            "apikey": secret.get_secret_value() if secret else None,
+        }
+        return Model(
+            model_id=model_id,
+            credentials=credentials,
+            project_id=self._config.project_id,
+        )
+
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Return the cached ``AsyncOpenAI`` client, creating it on first use.
+
+        :returns: A client pointed at ``<config.url>/openai/v1``
+        """
+        # getattr so that a missing attribute is treated like an unset client
+        # instead of raising AttributeError.
+        client = getattr(self, "_openai_client", None)
+        if not client:
+            api_key = self._config.api_key
+            # The config stores the key as a SecretStr, whose string form is
+            # masked; hand the real value to the client.
+            if hasattr(api_key, "get_secret_value"):
+                api_key = api_key.get_secret_value()
+            client = AsyncOpenAI(
+                base_url=f"{self._config.url}/openai/v1",
+                api_key=api_key,
+            )
+            self._openai_client = client
+        return client
+
+    async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
+        """Run a single ``generate`` call and convert the result to a completion response.
+
+        :param request: The completion request to execute
+        :returns: Whatever ``process_completion_response`` produces for the collected choices
+
+        NOTE(review): the return annotation says ``ChatCompletionResponse`` although the
+        value comes from ``process_completion_response`` -- confirm the intended type.
+        """
+        params = await self._get_params(request)
+        raw = self._get_client(request.model).generate(**params)
+
+        # A payload without a "results" key yields an empty list of choices.
+        choices = (
+            [
+                OpenAICompatCompletionChoice(
+                    finish_reason=item["stop_reason"] or None,
+                    text=item["generated_text"],
+                )
+                for item in raw["results"]
+            ]
+            if "results" in raw
+            else []
+        )
+        return process_completion_response(OpenAICompatCompletionResponse(choices=choices))
+
+    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
+        """Stream a completion, yielding chunks from ``process_completion_stream_response``.
+
+        :param request: The completion request to execute
+        """
+        params = await self._get_params(request)
+
+        async def _as_openai_compat_chunks():
+            # Wrap each text piece from generate_text_stream in the
+            # OpenAI-compatible response shape the stream processor consumes.
+            for text in self._get_client(request.model).generate_text_stream(**params):
+                yield OpenAICompatCompletionResponse(
+                    choices=[OpenAICompatCompletionChoice(finish_reason=None, text=text)],
+                )
+
+        async for chunk in process_completion_stream_response(_as_openai_compat_chunks()):
+            yield chunk
+
+    async def chat_completion(
+        self,
+        model_id: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
+    ) -> AsyncGenerator:
+        """Run a chat completion request against watsonx.
+
+        :param model_id: Identifier looked up in the model store
+        :param messages: Conversation messages
+        :param sampling_params: Sampling settings; a default ``SamplingParams()`` is used when ``None``
+        :param tools: Tool definitions; ``None`` is sent as an empty list
+        :param tool_choice: Accepted but not forwarded by this method
+        :param tool_prompt_format: Accepted but not forwarded by this method
+        :param response_format: Forwarded unchanged on the request
+        :param stream: When truthy, return the streaming generator instead of a single response
+        :param logprobs: Forwarded unchanged on the request
+        :param tool_config: Forwarded unchanged on the request
+        :returns: The result of ``_stream_chat_completion`` or the awaited ``_nonstream_chat_completion``
+        """
+        effective_sampling = SamplingParams() if sampling_params is None else sampling_params
+        resolved = await self.model_store.get_model(model_id)
+        request = ChatCompletionRequest(
+            model=resolved.provider_resource_id,
+            messages=messages,
+            sampling_params=effective_sampling,
+            tools=tools or [],
+            response_format=response_format,
+            stream=stream,
+            logprobs=logprobs,
+            tool_config=tool_config,
+        )
+
+        if not stream:
+            return await self._nonstream_chat_completion(request)
+        return self._stream_chat_completion(request)
+
+    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
+        """Run a single ``generate`` call and convert the result to a chat completion response.
+
+        :param request: The chat completion request to execute
+        :returns: The result of ``process_chat_completion_response`` for the collected choices
+        """
+        params = await self._get_params(request)
+        raw = self._get_client(request.model).generate(**params)
+
+        # A payload without a "results" key yields an empty list of choices.
+        choices = (
+            [
+                OpenAICompatCompletionChoice(
+                    finish_reason=item["stop_reason"] or None,
+                    text=item["generated_text"],
+                )
+                for item in raw["results"]
+            ]
+            if "results" in raw
+            else []
+        )
+        return process_chat_completion_response(OpenAICompatCompletionResponse(choices=choices), request)
+
+    async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
+        """Stream a chat completion, yielding chunks from ``process_chat_completion_stream_response``.
+
+        :param request: The chat completion request to execute
+        """
+        params = await self._get_params(request)
+        model_id = request.model
+
+        async def _as_openai_compat_chunks():
+            # generate_text_stream is consumed with a plain `for`; this wrapper
+            # re-exposes it as the async stream the processor expects.
+            for text in self._get_client(model_id).generate_text_stream(**params):
+                yield OpenAICompatCompletionResponse(
+                    choices=[OpenAICompatCompletionChoice(finish_reason=None, text=text)],
+                )
+
+        async for chunk in process_chat_completion_stream_response(_as_openai_compat_chunks(), request):
+            yield chunk
+
+    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+        """Translate a request into keyword arguments for the watsonx ``Model`` calls.
+
+        :param request: Chat or plain completion request
+        :returns: Dict with ``params`` (generation parameters keyed by ``GenParams``
+            names) and ``prompt`` (the rendered prompt)
+        :raises AssertionError: If a plain completion request contains media
+        """
+        gen_params: Dict[str, Any] = {}
+        media_present = request_has_media(request)
+        llama_model = self.get_llama_model(request.model)
+
+        if isinstance(request, ChatCompletionRequest):
+            prompt = await chat_completion_request_to_prompt(request, llama_model)
+        else:
+            assert not media_present, "watsonx does not support media for Completion requests"
+            prompt = await completion_request_to_prompt(request)
+
+        sampling = request.sampling_params
+        if sampling:
+            strategy = sampling.strategy
+            if strategy:
+                # watsonx accepts only "greedy" or "sample" as decoding method;
+                # the top_p / top_k strategies are both forms of sampling.
+                gen_params[GenParams.DECODING_METHOD] = (
+                    "greedy" if isinstance(strategy, GreedySamplingStrategy) else "sample"
+                )
+            if sampling.max_tokens:
+                gen_params[GenParams.MAX_NEW_TOKENS] = sampling.max_tokens
+            if sampling.repetition_penalty:
+                gen_params[GenParams.REPETITION_PENALTY] = sampling.repetition_penalty
+
+            if isinstance(strategy, TopPSamplingStrategy):
+                gen_params[GenParams.TOP_P] = strategy.top_p
+                gen_params[GenParams.TEMPERATURE] = strategy.temperature
+            elif isinstance(strategy, TopKSamplingStrategy):
+                gen_params[GenParams.TOP_K] = strategy.top_k
+            elif isinstance(strategy, GreedySamplingStrategy):
+                gen_params[GenParams.TEMPERATURE] = 0.0
+
+        # Always stop at the end-of-text marker.
+        gen_params[GenParams.STOP_SEQUENCES] = ["<|endoftext|>"]
+
+        return {"params": gen_params, "prompt": prompt}
+
+    async def embeddings(
+        self,
+        model_id: str,
+        contents: List[str] | List[InterleavedContentItem],
+        text_truncation: Optional[TextTruncation] = TextTruncation.none,
+        output_dimension: Optional[int] = None,
+        task_type: Optional[EmbeddingTaskType] = None,
+    ) -> EmbeddingsResponse:
+        """Embeddings are not available from this adapter.
+
+        :raises NotImplementedError: Always, regardless of the arguments.
+        """
+        unsupported_message = "embedding is not supported for watsonx"
+        raise NotImplementedError(unsupported_message)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Run an OpenAI-style text completion through the OpenAI client.
+
+        The model is looked up in the model store and its ``provider_resource_id``
+        is sent as the model name. ``guided_choice`` and ``prompt_logprobs`` are
+        accepted but not forwarded to the request.
+        """
+        registered_model = await self.model_store.get_model(model)
+        request_kwargs = {
+            "model": registered_model.provider_resource_id,
+            "prompt": prompt,
+            "best_of": best_of,
+            "echo": echo,
+            "frequency_penalty": frequency_penalty,
+            "logit_bias": logit_bias,
+            "logprobs": logprobs,
+            "max_tokens": max_tokens,
+            "n": n,
+            "presence_penalty": presence_penalty,
+            "seed": seed,
+            "stop": stop,
+            "stream": stream,
+            "stream_options": stream_options,
+            "temperature": temperature,
+            "top_p": top_p,
+            "user": user,
+        }
+        params = await prepare_openai_completion_params(**request_kwargs)
+        client = self._get_openai_client()
+        return await client.completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Run an OpenAI-style chat completion through the OpenAI client.
+
+        The model is looked up in the model store and its ``provider_resource_id``
+        is sent as the model name. When the prepared params ask for streaming, the
+        generator from ``_stream_openai_chat_completion`` is returned; otherwise
+        the completed response is awaited and returned.
+        """
+        registered_model = await self.model_store.get_model(model)
+        request_kwargs = {
+            "model": registered_model.provider_resource_id,
+            "messages": messages,
+            "frequency_penalty": frequency_penalty,
+            "function_call": function_call,
+            "functions": functions,
+            "logit_bias": logit_bias,
+            "logprobs": logprobs,
+            "max_completion_tokens": max_completion_tokens,
+            "max_tokens": max_tokens,
+            "n": n,
+            "parallel_tool_calls": parallel_tool_calls,
+            "presence_penalty": presence_penalty,
+            "response_format": response_format,
+            "seed": seed,
+            "stop": stop,
+            "stream": stream,
+            "stream_options": stream_options,
+            "temperature": temperature,
+            "tool_choice": tool_choice,
+            "tools": tools,
+            "top_logprobs": top_logprobs,
+            "top_p": top_p,
+            "user": user,
+        }
+        params = await prepare_openai_completion_params(**request_kwargs)
+
+        wants_stream = params.get("stream", False)
+        if not wants_stream:
+            return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
+        return self._stream_openai_chat_completion(params)
+
+    async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
+        """Stream chat-completion chunks, dropping an unrequested trailing usage chunk.
+
+        :param params: Prepared request parameters passed to the OpenAI client.
+        :returns: An async generator yielding the chunks received from the client.
+        """
+        # watsonx.ai may append usage data to the stream even when it was not asked for.
+        stream_options = params.get("stream_options", None)
+        include_usage = stream_options.get("include_usage", False) if stream_options else False
+        client = self._get_openai_client()
+        stream = await client.chat.completions.create(**params)
+
+        finished = False
+        async for chunk in stream:
+            # A choice-less chunk arriving after a finish reason is the usage chunk;
+            # stop there unless the caller requested usage.
+            if not include_usage and finished and len(chunk.choices) == 0:
+                break
+            yield chunk
+            if any(choice.finish_reason for choice in chunk.choices):
+                finished = True
--- a/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/llama_stack/providers/remote/post_training/nvidia/README.md
@ -36,7 +36,6 @@ import os

 os.environ["NVIDIA_API_KEY"] = "your-api-key"
 os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
-os.environ["NVIDIA_USER_ID"] = "llama-stack-user"
 os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
 os.environ["NVIDIA_PROJECT_ID"] = "test-project"
 os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = "test-example-model@v1"
@ -128,13 +127,14 @@ client.post_training.job.cancel(job_uuid="your-job-id")
 #### 1. Register the model

 ```python
-model = Model(
-    identifier="test-example-model@v1",
+from llama_stack.apis.models import Model, ModelType
+
+client.models.register(
+    model_id="test-example-model@v1",
    provider_id="nvidia",
    provider_model_id="test-example-model@v1",
    model_type=ModelType.llm,
 )
-client.register_model(model)
 ```

 #### 2. Inference with the fine-tuned model
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@ -16,7 +16,11 @@ _MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta/llama-3.1-8b-instruct",
        CoreModelId.llama3_1_8b_instruct.value,
-    )
+    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.2-1b-instruct",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
 ]


--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@ -67,13 +67,18 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        self.timeout = aiohttp.ClientTimeout(total=config.timeout)
        # TODO: filter by available models based on /config endpoint
        ModelRegistryHelper.__init__(self, model_entries=_MODEL_ENTRIES)
-        self.session = aiohttp.ClientSession(headers=self.headers, timeout=self.timeout)
-        self.customizer_url = config.customizer_url
+        self.session = None

+        self.customizer_url = config.customizer_url
        if not self.customizer_url:
            warnings.warn("Customizer URL is not set, using default value: http://nemo.test", stacklevel=2)
            self.customizer_url = "http://nemo.test"

+    async def _get_session(self) -> aiohttp.ClientSession:
+        """Return the adapter's aiohttp session, creating one if it is missing or closed."""
+        session = self.session
+        if session is None or session.closed:
+            session = aiohttp.ClientSession(headers=self.headers, timeout=self.timeout)
+            self.session = session
+        return session
+
    async def _make_request(
        self,
        method: str,
@ -94,11 +99,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        if json and "Content-Type" not in request_headers:
            request_headers["Content-Type"] = "application/json"

+        session = await self._get_session()
        for _ in range(self.config.max_retries):
-            # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
-            async with self.session.request(
-                method, url, params=params, json=json, verify_ssl=False, **kwargs
-            ) as response:
+            async with session.request(method, url, params=params, json=json, **kwargs) as response:
                if response.status >= 400:
                    error_data = await response.json()
                    raise Exception(f"API request failed: {error_data}")
@ -125,8 +128,8 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        jobs = []
        for job in response.get("data", []):
            job_id = job.pop("id")
-            job_status = job.pop("status", "unknown").lower()
-            mapped_status = STATUS_MAPPING.get(job_status, "unknown")
+            job_status = job.pop("status", "scheduled").lower()
+            mapped_status = STATUS_MAPPING.get(job_status, "scheduled")

            # Convert string timestamps to datetime objects
            created_at = (
@ -180,7 +183,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        )

        api_status = response.pop("status").lower()
-        mapped_status = STATUS_MAPPING.get(api_status, "unknown")
+        mapped_status = STATUS_MAPPING.get(api_status, "scheduled")

        return NvidiaPostTrainingJobStatusResponse(
            status=JobStatus(mapped_status),
@ -242,6 +245,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):

        Supported models:
            - meta/llama-3.1-8b-instruct
+            - meta/llama-3.2-1b-instruct

        Supported algorithm configs:
            - LoRA, SFT
@ -287,10 +291,6 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):

            - LoRA config:
                ## NeMo customizer specific LoRA parameters
-                - adapter_dim: int - Adapter dimension
-                    Default: 8 (supports powers of 2)
-                - adapter_dropout: float - Adapter dropout
-                    Default: None (0.0-1.0)
                - alpha: int - Scaling factor for the LoRA update
                    Default: 16
            Note:
@ -300,7 +300,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
            User is informed about unsupported parameters via warnings.
        """
        # Map model to nvidia model name
-        # ToDo: only supports llama-3.1-8b-instruct now, need to update this to support other models
+        # See `_MODEL_ENTRIES` for supported models
        nvidia_model = self.get_provider_model_id(model)

        # Check for unsupported method parameters
@ -333,7 +333,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
            },
            "data_config": {"dataset_id", "batch_size"},
            "optimizer_config": {"lr", "weight_decay"},
-            "lora_config": {"type", "adapter_dim", "adapter_dropout", "alpha"},
+            "lora_config": {"type", "alpha"},
        }

        # Validate all parameters at once
@ -392,17 +392,10 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):

        # Handle LoRA-specific configuration
        if algorithm_config:
-            algorithm_config_dict = algorithm_config.model_dump()
-            if algorithm_config_dict.get("type") == "LoRA":
-                warn_unsupported_params(algorithm_config_dict, supported_params["lora_config"], "LoRA config")
+            if algorithm_config.type == "LoRA":
+                warn_unsupported_params(algorithm_config, supported_params["lora_config"], "LoRA config")
                job_config["hyperparameters"]["lora"] = {
-                    k: v
-                    for k, v in {
-                        "adapter_dim": algorithm_config_dict.get("adapter_dim"),
-                        "alpha": algorithm_config_dict.get("alpha"),
-                        "adapter_dropout": algorithm_config_dict.get("adapter_dropout"),
-                    }.items()
-                    if v is not None
+                    k: v for k, v in {"alpha": algorithm_config.alpha}.items() if v is not None
                }
            else:
                raise NotImplementedError(f"Unsupported algorithm config: {algorithm_config}")
--- a/llama_stack/providers/remote/safety/nvidia/README.md
+++ b/llama_stack/providers/remote/safety/nvidia/README.md
@ -0,0 +1,77 @@
+# NVIDIA Safety Provider for LlamaStack
+
+This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.
+
+## Features
+
+- Run safety checks for messages
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to NVIDIA NeMo Guardrails service
+- A NIM for the model used for safety checks is deployed
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = "your-api-key"
+os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+#### Create a safety shield
+
+```python
+from llama_stack.apis.safety import Shield
+from llama_stack.apis.inference import Message
+
+# Create a safety shield
+shield = Shield(
+    shield_id="your-shield-id",
+    provider_resource_id="safety-model-id",  # The model to use for safety checks
+    description="Safety checks for content moderation",
+)
+
+# Register the shield
+await client.safety.register_shield(shield)
+```
+
+#### Run safety checks
+
+```python
+# Messages to check
+messages = [Message(role="user", content="Your message to check")]
+
+# Run safety check
+response = await client.safety.run_shield(
+    shield_id="your-shield-id",
+    messages=messages,
+)
+
+# Check for violations
+if response.violation:
+    print(f"Safety violation detected: {response.violation.user_message}")
+    print(f"Violation level: {response.violation.violation_level}")
+    print(f"Metadata: {response.violation.metadata}")
+else:
+    print("No safety violations detected")
+```
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@ -8,7 +8,17 @@ import logging
 import time
 import uuid
 import warnings
-from typing import Any, AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, List, Optional, Union
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Awaitable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Union,
+)

 from openai import AsyncStream
 from openai.types.chat import (
@ -78,6 +88,7 @@ from llama_stack.apis.common.content_types import (
    TextDelta,
    ToolCallDelta,
    ToolCallParseStatus,
+    _URLOrData,
 )
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
@ -93,6 +104,7 @@ from llama_stack.apis.inference import (
    SamplingParams,
    SystemMessage,
    TokenLogProbs,
+    ToolChoice,
    ToolResponseMessage,
    TopKSamplingStrategy,
    TopPSamplingStrategy,
@ -103,7 +115,6 @@ from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
    OpenAICompletion,
    OpenAICompletionChoice,
-    OpenAIMessageParam,
    OpenAIResponseFormatParam,
    ToolConfig,
 )
@ -513,11 +524,26 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
    else:
        content = [await _convert_content(message.content)]

-    return {
+    result = {
        "role": message.role,
        "content": content,
    }

+    if hasattr(message, "tool_calls") and message.tool_calls:
+        result["tool_calls"] = []
+        for tc in message.tool_calls:
+            result["tool_calls"].append(
+                {
+                    "id": tc.call_id,
+                    "type": "function",
+                    "function": {
+                        "name": tc.tool_name,
+                        "arguments": tc.arguments_json if hasattr(tc, "arguments_json") else json.dumps(tc.arguments),
+                    },
+                }
+            )
+    return result
+

 class UnparseableToolCall(BaseModel):
    """
@ -612,13 +638,10 @@ async def convert_message_to_openai_dict_new(
            )
            for tool in message.tool_calls
        ]
-        params = {}
-        if tool_calls:
-            params = {"tool_calls": tool_calls}
        out = OpenAIChatCompletionAssistantMessage(
            role="assistant",
            content=await _convert_message_content(message.content),
-            **params,
+            tool_calls=tool_calls or None,
        )
    elif isinstance(message, ToolResponseMessage):
        out = OpenAIChatCompletionToolMessage(
@ -695,7 +718,10 @@ def to_openai_param_type(param_type: str) -> dict:
    if param_type.startswith("list[") and param_type.endswith("]"):
        inner_type = param_type[5:-1]
        if inner_type in basic_types:
-            return {"type": "array", "items": {"type": basic_types.get(inner_type, inner_type)}}
+            return {
+                "type": "array",
+                "items": {"type": basic_types.get(inner_type, inner_type)},
+            }

    return {"type": param_type}

@ -815,6 +841,10 @@ def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
 def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[str, Any]]] = None) -> ToolConfig:
+    # Builds a ToolConfig from an OpenAI-style `tool_choice`; a falsy value leaves the defaults untouched.
    tool_config = ToolConfig()
    if tool_choice:
+        # Convert to a ToolChoice when ToolChoice(...) accepts the value; if it raises
+        # ValueError, the original value is stored on the config unchanged.
+        try:
+            tool_choice = ToolChoice(tool_choice)
+        except ValueError:
+            pass
        tool_config.tool_choice = tool_choice
    return tool_config

@ -849,7 +879,9 @@ def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None)
    return lls_tools


-def _convert_openai_request_response_format(response_format: OpenAIResponseFormatParam = None):
+def _convert_openai_request_response_format(
+    response_format: OpenAIResponseFormatParam = None,
+):
    if not response_format:
        return None
    # response_format can be a dict or a pydantic model
@ -957,38 +989,50 @@ def _convert_openai_sampling_params(
    return sampling_params


-def _convert_openai_request_messages(messages: List[OpenAIMessageParam]):
-    # Llama Stack messages and OpenAI messages are similar, but not identical.
-    lls_messages = []
+def openai_messages_to_messages(
+    messages: List[OpenAIChatCompletionMessage],
+) -> List[Message]:
+    """
+    Convert a list of OpenAIChatCompletionMessage into a list of Message.
+
+    Messages are dispatched on ``role``: "system" -> SystemMessage, "user" -> UserMessage,
+    "assistant" -> CompletionMessage, "tool" -> ToolResponseMessage.
+
+    :param messages: OpenAI-style chat messages to convert.
+    :returns: The converted messages, in the same order as the input.
+    :raises ValueError: If a message has a role other than the four listed above.
+    """
+    converted_messages = []
    for message in messages:
-        lls_message = dict(message)
+        if message.role == "system":
+            # System content is passed through as-is (not run through openai_content_to_content).
+            converted_message = SystemMessage(content=message.content)
+        elif message.role == "user":
+            converted_message = UserMessage(content=openai_content_to_content(message.content))
+        elif message.role == "assistant":
+            # Assistant content is passed through as-is; the stop reason is always end_of_turn.
+            converted_message = CompletionMessage(
+                content=message.content,
+                tool_calls=_convert_openai_tool_calls(message.tool_calls),
+                stop_reason=StopReason.end_of_turn,
+            )
+        elif message.role == "tool":
+            # OpenAI's `tool_call_id` is stored as the Llama Stack `call_id`.
+            converted_message = ToolResponseMessage(
+                role="tool",
+                call_id=message.tool_call_id,
+                content=openai_content_to_content(message.content),
+            )
+        else:
+            raise ValueError(f"Unknown role {message.role}")
+        converted_messages.append(converted_message)
+    return converted_messages

-        #  Llama Stack expects `call_id` but OpenAI uses `tool_call_id`
-        tool_call_id = lls_message.pop("tool_call_id", None)
-        if tool_call_id:
-            lls_message["call_id"] = tool_call_id

-        content = lls_message.get("content", None)
-        if isinstance(content, list):
-            lls_content = []
-            for item in content:
-                # items can either by pydantic models or dicts here...
-                item = dict(item)
-                if item.get("type", "") == "image_url":
-                    lls_item = ImageContentItem(
-                        type="image",
-                        image=URL(uri=item.get("image_url", {}).get("url", "")),
-                    )
-                elif item.get("type", "") == "text":
-                    lls_item = TextContentItem(
-                        type="text",
-                        text=item.get("text", ""),
-                    )
-                lls_content.append(lls_item)
-            lls_message["content"] = lls_content
-        lls_messages.append(lls_message)
-
-    return lls_messages
+def openai_content_to_content(content: Union[str, Iterable[OpenAIChatCompletionContentPartParam]]):
+    """Convert OpenAI message content into Llama Stack content.
+
+    Strings are returned unchanged, lists are converted element by element, and
+    objects with a ``type`` attribute of "text" or "image_url" become a
+    TextContentItem or an ImageContentItem respectively.
+
+    :raises ValueError: For any other content value or content part type.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return [openai_content_to_content(part) for part in content]
+    if not hasattr(content, "type"):
+        raise ValueError(f"Unknown content type: {content}")
+
+    part_type = content.type
+    if part_type == "text":
+        return TextContentItem(type="text", text=content.text)
+    if part_type == "image_url":
+        image_source = _URLOrData(url=URL(uri=content.image_url.url))
+        return ImageContentItem(type="image", image=image_source)
+    raise ValueError(f"Unknown content type: {part_type}")


 def convert_openai_chat_completion_choice(
@ -1313,7 +1357,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
-        messages = _convert_openai_request_messages(messages)
+        messages = openai_messages_to_messages(messages)
        response_format = _convert_openai_request_response_format(response_format)
        sampling_params = _convert_openai_sampling_params(
            max_tokens=max_tokens,
@ -1321,7 +1365,10 @@ class OpenAIChatCompletionToLlamaStackMixin:
            top_p=top_p,
        )
        tool_config = _convert_openai_request_tool_config(tool_choice)
+
        tools = _convert_openai_request_tools(tools)
+        if tool_config.tool_choice == ToolChoice.none:
+            tools = []

        outstanding_responses = []
        # "n" is the number of completions to generate per prompt
@ -1346,7 +1393,9 @@ class OpenAIChatCompletionToLlamaStackMixin:
        )

    async def _process_stream_response(
-        self, model: str, outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]]
+        self,
+        model: str,
+        outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
    ):
        id = f"chatcmpl-{uuid.uuid4()}"
        for outstanding_response in outstanding_responses:
@ -1369,11 +1418,31 @@ class OpenAIChatCompletionToLlamaStackMixin:
                elif isinstance(event.delta, ToolCallDelta):
                    if event.delta.parse_status == ToolCallParseStatus.succeeded:
                        tool_call = event.delta.tool_call
+
+                        # First chunk includes full structure
                        openai_tool_call = OpenAIChoiceDeltaToolCall(
                            index=0,
                            id=tool_call.call_id,
                            function=OpenAIChoiceDeltaToolCallFunction(
-                                name=tool_call.tool_name, arguments=tool_call.arguments_json
+                                name=tool_call.tool_name,
+                                arguments="",
+                            ),
+                        )
+                        delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
+                        yield OpenAIChatCompletionChunk(
+                            id=id,
+                            choices=[
+                                OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
+                            ],
+                            created=int(time.time()),
+                            model=model,
+                            object="chat.completion.chunk",
+                        )
+                        # arguments
+                        openai_tool_call = OpenAIChoiceDeltaToolCall(
+                            index=0,
+                            function=OpenAIChoiceDeltaToolCallFunction(
+                                arguments=tool_call.arguments_json,
                            ),
                        )
                        delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@ -52,6 +52,9 @@ from llama_stack.models.llama.llama3.prompt_templates import (
    SystemDefaultGenerator,
 )
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
+)
 from llama_stack.models.llama.sku_list import resolve_model
 from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
 from llama_stack.providers.utils.inference import supported_inference_models
@ -306,10 +309,11 @@ def chat_completion_request_to_messages(
    elif model.model_family in (
        ModelFamily.llama3_2,
        ModelFamily.llama3_3,
-        ModelFamily.llama4,
    ):
-        # llama3.2, llama3.3 and llama4 models follow the same tool prompt format
-        messages = augment_messages_for_tools_llama_3_2(request)
+        # llama3.2, llama3.3 follow the same tool prompt format
+        messages = augment_messages_for_tools_llama(request, PythonListCustomToolGenerator)
+    elif model.model_family == ModelFamily.llama4:
+        messages = augment_messages_for_tools_llama(request, PythonListCustomToolGeneratorLlama4)
    else:
        messages = request.messages

@ -399,8 +403,9 @@ def augment_messages_for_tools_llama_3_1(
    return messages


-def augment_messages_for_tools_llama_3_2(
+def augment_messages_for_tools_llama(
    request: ChatCompletionRequest,
+    custom_tool_prompt_generator,
 ) -> List[Message]:
    existing_messages = request.messages
    existing_system_message = None
@ -434,7 +439,7 @@ def augment_messages_for_tools_llama_3_2(
        if existing_system_message and request.tool_config.system_message_behavior == SystemMessageBehavior.replace:
            system_prompt = existing_system_message.content

-        tool_template = PythonListCustomToolGenerator().gen(custom_tools, system_prompt)
+        tool_template = custom_tool_prompt_generator().gen(custom_tools, system_prompt)

        sys_content += tool_template.render()
        sys_content += "\n"
--- a/llama_stack/templates/dependencies.json
+++ b/llama_stack/templates/dependencies.json
@ -756,5 +756,41 @@
    "vllm",
    "sentence-transformers --no-deps",
    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "watsonx": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "datasets",
+    "emoji",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "ibm_watson_machine_learning",
+    "langdetect",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "pythainlp",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "tree_sitter",
+    "uvicorn"
  ]
 }
--- a/llama_stack/templates/meta-reference-gpu/doc_template.md
+++ b/llama_stack/templates/meta-reference-gpu/doc_template.md
@ -69,6 +69,7 @@ LLAMA_STACK_PORT=8321
 docker run \
  -it \
  --pull always \
+  --gpus all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-{{ name }} \
@ -82,6 +83,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
  -it \
  --pull always \
+  --gpus all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-{{ name }} \
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@ -25,14 +25,84 @@ The following models are available by default:
 {% endif %}


-### Prerequisite: API Keys
+## Prerequisites
+### NVIDIA API Keys

-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+  1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
+  2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the NVIDIA Safety docs for supported features and example usage.
+
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+   -H 'accept: application/json' \
+   -H 'Content-Type: application/json' \
+   -d '{
+      "name": "llama-3.2-1b-instruct",
+      "namespace": "meta",
+      "config": {
+         "model": "meta/llama-3.2-1b-instruct",
+         "nim_deployment": {
+            "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+            "image_tag": "1.8.3",
+            "pvc_size": "25Gi",
+            "gpu": 1,
+            "additional_envs": {
+               "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+            }
+         }
+      }
+   }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```

 ## Running Llama Stack with NVIDIA

-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker which has a pre-built image.

 ### Via Docker

@ -54,9 +124,23 @@ docker run \
 ### Via Conda

 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+
+### Via venv
+
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@ -98,23 +98,15 @@ def get_distribution_template() -> DistributionTemplate:
                "",
                "NVIDIA API Key",
            ),
-            ## Nemo Customizer related variables
-            "NVIDIA_USER_ID": (
-                "llama-stack-user",
-                "NVIDIA User ID",
-            ),
            "NVIDIA_APPEND_API_VERSION": (
                "True",
                "Whether to append the API version to the base_url",
            ),
+            ## Nemo Customizer related variables
            "NVIDIA_DATASET_NAMESPACE": (
                "default",
                "NVIDIA Dataset Namespace",
            ),
-            "NVIDIA_ACCESS_POLICIES": (
-                "{}",
-                "NVIDIA Access Policies",
-            ),
            "NVIDIA_PROJECT_ID": (
                "test-project",
                "NVIDIA Project ID",
--- a/llama_stack/templates/nvidia/run-with-safety.yaml
+++ b/llama_stack/templates/nvidia/run-with-safety.yaml
@ -57,7 +57,7 @@ providers:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
-      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
+      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@ -52,7 +52,7 @@ providers:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
-      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
+      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
@ -178,6 +178,16 @@ models:
  provider_id: nvidia
  provider_model_id: meta/llama-3.2-90b-vision-instruct
  model_type: llm
+- metadata: {}
+  model_id: meta/llama-3.3-70b-instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
 - metadata:
    embedding_dimension: 2048
    context_length: 8192
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@ -28,10 +28,10 @@ The following environment variables can be configured:

 ## Setting up vLLM server

-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
 server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
 [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
-that we only use GPUs here for demonstration purposes.
+that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can add `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) to the `docker run` command to log the API server's responses for debugging.

 ### Setting up vLLM server on AMD GPU

@ -149,6 +149,55 @@ docker run \
    --port $SAFETY_PORT
 ```

+### Setting up vLLM server on Intel GPU
+
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which guides you towards installing vLLM from source or self-building a vLLM Docker container, Intel provides a prebuilt vLLM container to use on systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $INFERENCE_PORT:$INFERENCE_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $INFERENCE_MODEL \
+    --port $INFERENCE_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $SAFETY_PORT:$SAFETY_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $SAFETY_MODEL \
+    --port $SAFETY_PORT
+```
+
 ## Running Llama Stack

 Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
--- a/llama_stack/templates/watsonx/init.py
+++ b/llama_stack/templates/watsonx/init.py
@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .watsonx import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/watsonx/build.yaml
+++ b/llama_stack/templates/watsonx/build.yaml
@ -0,0 +1,30 @@
+version: '2'
+distribution_spec:
+  description: Use watsonx for running LLM inference
+  providers:
+    inference:
+    - remote::watsonx
+    vector_io:
+    - inline::faiss
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - inline::code-interpreter
+    - inline::rag-runtime
+    - remote::model-context-protocol
+image_type: conda
--- a/llama_stack/templates/watsonx/doc_template.md
+++ b/llama_stack/templates/watsonx/doc_template.md
@ -0,0 +1,74 @@
+---
+orphan: true
+---
+# watsonx Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars  %}
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a watsonx API Key. You can get one by referring to [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
+
+
+## Running Llama Stack with watsonx
+
+You can do this via Conda (build code), venv or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env WATSONX_API_KEY=$WATSONX_API_KEY \
+  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
+  --env WATSONX_BASE_URL=$WATSONX_BASE_URL
+```
+
+### Via Conda
+
+```bash
+llama stack build --template watsonx --image-type conda
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env WATSONX_API_KEY=$WATSONX_API_KEY \
+  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
+```
--- a/llama_stack/templates/watsonx/run.yaml
+++ b/llama_stack/templates/watsonx/run.yaml
@ -0,0 +1,210 @@
+version: '2'
+image_name: watsonx
+apis:
+- agents
+- datasetio
+- eval
+- inference
+- safety
+- scoring
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: watsonx
+    provider_type: remote::watsonx
+    config:
+      url: ${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}
+      api_key: ${env.WATSONX_API_KEY:}
+      project_id: ${env.WATSONX_PROJECT_ID:}
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/watsonx/trace_store.db}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/meta_reference_eval.db
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/huggingface_datasetio.db
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/localfs_datasetio.db
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: code-interpreter
+    provider_type: inline::code-interpreter
+    config: {}
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/registry.db
+models:
+- metadata: {}
+  model_id: meta-llama/llama-3-3-70b-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-2-13b-chat
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-2-13b-chat
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-2-13b
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-2-13b-chat
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-1-70b-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-1-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-70B-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-1-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-1-8b-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-1-8b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-8B-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-1-8b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-2-11b-vision-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-2-1b-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-1b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-1B-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-1b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-2-3b-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-3b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-3B-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-3b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-3-2-90b-vision-instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/llama-guard-3-11b-vision
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-guard-3-11b-vision
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-11B-Vision
+  provider_id: watsonx
+  provider_model_id: meta-llama/llama-guard-3-11b-vision
+  model_type: llm
+shields: []
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+- toolgroup_id: builtin::code_interpreter
+  provider_id: code-interpreter
+server:
+  port: 8321
--- a/llama_stack/templates/watsonx/watsonx.py
+++ b/llama_stack/templates/watsonx/watsonx.py
@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import Provider, ToolGroupInput
+from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
+from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::watsonx"],
+        "vector_io": ["inline::faiss"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "inline::code-interpreter",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+
+    inference_provider = Provider(
+        provider_id="watsonx",
+        provider_type="remote::watsonx",
+        config=WatsonXConfig.sample_run_config(),
+    )
+
+    available_models = {
+        "watsonx": MODEL_ENTRIES,
+    }
+    default_tool_groups = [
+        ToolGroupInput(
+            toolgroup_id="builtin::websearch",
+            provider_id="tavily-search",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::rag",
+            provider_id="rag-runtime",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::code_interpreter",
+            provider_id="code-interpreter",
+        ),
+    ]
+
+    default_models = get_model_registry(available_models)
+    return DistributionTemplate(
+        name="watsonx",
+        distro_type="remote_hosted",
+        description="Use watsonx for running LLM inference",
+        container_image=None,
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        available_models_by_provider=available_models,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=default_models,
+                default_tool_groups=default_tool_groups,
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "WATSONX_API_KEY": (
+                "",
+                "watsonx API Key",
+            ),
+            "WATSONX_PROJECT_ID": (
+                "",
+                "watsonx Project ID",
+            ),
+        },
+    )