Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-03 09:21:45 +00:00)

Merge branch 'main' into add-watsonx-inference-adapter

Commit c407f3c340 · 28 changed files with 1786 additions and 1354 deletions
.github/workflows/providers-build.yml (vendored) · 6 changes

@@ -86,15 +86,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
         with:
           python-version: '3.10'

       - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
         with:
           python-version: "3.10"
docs/_static/llama-stack-spec.html (vendored) · 22 changes

@@ -5221,17 +5221,25 @@
           "default": 10
         },
         "model": {
-          "type": "string"
+          "type": "string",
+          "description": "The model identifier to use for the agent"
         },
         "instructions": {
-          "type": "string"
+          "type": "string",
+          "description": "The system instructions for the agent"
         },
+        "name": {
+          "type": "string",
+          "description": "Optional name for the agent, used in telemetry and identification"
+        },
         "enable_session_persistence": {
           "type": "boolean",
-          "default": false
+          "default": false,
+          "description": "Optional flag indicating whether session data has to be persisted"
         },
         "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat"
+          "$ref": "#/components/schemas/ResponseFormat",
+          "description": "Optional response format configuration"
         }
       },
       "additionalProperties": false,
@@ -5239,7 +5247,8 @@
         "model",
         "instructions"
       ],
-      "title": "AgentConfig"
+      "title": "AgentConfig",
+      "description": "Configuration for an agent."
     },
     "AgentTool": {
       "oneOf": [
@@ -8891,8 +8900,7 @@
       },
       "additionalProperties": false,
       "required": [
-        "role",
-        "content"
+        "role"
       ],
       "title": "OpenAIAssistantMessageParam",
       "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
docs/_static/llama-stack-spec.yaml (vendored) · 12 changes

@@ -3686,18 +3686,29 @@ components:
           default: 10
         model:
           type: string
+          description: >-
+            The model identifier to use for the agent
         instructions:
           type: string
+          description: The system instructions for the agent
+        name:
+          type: string
+          description: >-
+            Optional name for the agent, used in telemetry and identification
         enable_session_persistence:
           type: boolean
           default: false
+          description: >-
+            Optional flag indicating whether session data has to be persisted
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
+          description: Optional response format configuration
       additionalProperties: false
       required:
         - model
         - instructions
       title: AgentConfig
+      description: Configuration for an agent.
     AgentTool:
       oneOf:
         - type: string
@@ -6097,7 +6108,6 @@ components:
       additionalProperties: false
       required:
         - role
-        - content
       title: OpenAIAssistantMessageParam
       description: >-
         A message containing the model's (assistant) response in an OpenAI-compatible
(deleted file)

@@ -1,88 +0,0 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution

The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |

### Environment Variables

The following environment variables can be configured:

- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

### Models

The following models are available by default:

- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`

### Prerequisite: API Keys

Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).

## Running Llama Stack with NVIDIA

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=8321
docker run \
  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-nvidia \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
```

### Via Conda

```bash
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```
@@ -45,20 +45,91 @@ The following models are available by default:

- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`

-### Prerequisite: API Keys
+## Prerequisites
+### NVIDIA API Keys

-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
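To make the endpoint layout concrete, here is a minimal sketch of the environment variables described in the sections below. The URLs are placeholders for illustration, not real endpoints — substitute the values from your own deployment:

```python
import os

# Core NeMo microservices (Customizer, Evaluator, Guardrails) typically share
# one platform endpoint; NIM Proxy and Data Store are reached separately.
os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM Proxy (inference)
os.environ["NVIDIA_DATASETS_URL"] = "http://data-store.test"  # NeMo Data Store
os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"  # NeMo Customizer
os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"  # NeMo Evaluator
os.environ["GUARDRAILS_SERVICE_URL"] = "http://nemo.test"  # NeMo Guardrails
```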
### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.

The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.

### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.

See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
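Since the Data Store speaks the Hugging Face Hub API, a minimal sketch of pointing `HfApi` at it could look like the following (the endpoint value is assumed to come from `NVIDIA_DATASETS_URL` as set above):

```python
import os

from huggingface_hub import HfApi

# Point the standard Hugging Face Hub client at the NeMo Data Store endpoint.
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="")

# List the dataset repositories currently stored in the Data Store.
for dataset in hf_api.list_datasets():
    print(dataset.id)
```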
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
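As a rough illustration of that Benchmark-to-Evaluation-Config mapping, registering a benchmark through the Llama Stack client might look like the sketch below. The IDs and scoring function are made up for illustration; check the linked Eval docs for the exact parameters your client version expects:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Registering a Benchmark in Llama Stack creates a corresponding
# Evaluation Config in NeMo Evaluator (IDs below are hypothetical).
client.benchmarks.register(
    benchmark_id="my-eval-benchmark",
    dataset_id="my-dataset",
    scoring_functions=["basic::equality"],
)
```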
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
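For orientation, submitting a supervised fine-tuning job through the Llama Stack client might look roughly like this. The job name, model, dataset, and the shapes of `algorithm_config`/`training_config` are assumptions for illustration only — consult the linked Post-Training docs for the parameters the NVIDIA provider actually accepts:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Kick off a NeMo Customizer fine-tuning job (all values are illustrative).
job = client.post_training.supervised_fine_tune(
    job_uuid="my-sft-job",
    model="meta-llama/Llama-3.1-8B-Instruct",
    checkpoint_dir="",
    algorithm_config={"type": "LoRA", "adapter_dim": 16},
    training_config={
        "n_epochs": 1,
        "data_config": {"dataset_id": "my-dataset", "batch_size": 8},
    },
    hyperparam_search_config={},
    logger_config={},
)
print(job)
```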
### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the NVIDIA Safety docs for supported features and example usage.

## Deploying models
To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.

### Via Docker

@@ -80,9 +151,27 @@ docker run \
### Via Conda

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Example Notebooks
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
@@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):

 @json_schema_type
 class AgentConfig(AgentConfigCommon):
+    """Configuration for an agent.
+
+    :param model: The model identifier to use for the agent
+    :param instructions: The system instructions for the agent
+    :param name: Optional name for the agent, used in telemetry and identification
+    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
+    :param response_format: Optional response format configuration
+    """
+
     model: str
     instructions: str
+    name: Optional[str] = None
     enable_session_persistence: Optional[bool] = False
     response_format: Optional[ResponseFormat] = None
@@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
     """

     role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent
+    content: Optional[OpenAIChatCompletionMessageContent] = None
     name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
+    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None


 @json_schema_type
@@ -235,10 +235,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         )

     except (Exception, RuntimeError) as exc:
+        import traceback
+
         cprint(
             f"Error building stack: {exc}",
             color="red",
         )
+        cprint("Stack trace:", color="red")
+        traceback.print_exc()
         sys.exit(1)
     if run_config is None:
         cprint(
@@ -350,7 +354,7 @@ def _run_stack_build_command_from_build_config(
         build_config,
         build_file_path,
         image_name,
-        template_or_config=template_name or config_path,
+        template_or_config=template_name or config_path or str(build_file_path),
     )
     if return_code != 0:
         raise RuntimeError(f"Failed to build image {image_name}")
@@ -37,6 +37,17 @@ def tool_chat_page():
         label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
     )

+    if "builtin::rag" in toolgroup_selection:
+        vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+        if not vector_dbs:
+            st.info("No vector databases available for selection.")
+        vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
+        selected_vector_dbs = st.multiselect(
+            label="Select Document Collections to use in RAG queries",
+            options=vector_dbs,
+            on_change=reset_agent,
+        )
+
     st.subheader("MCP Servers")
     mcp_selection = st.pills(
         label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
@@ -67,6 +78,16 @@ def tool_chat_page():
         on_change=reset_agent,
     )

+    for i, tool_name in enumerate(toolgroup_selection):
+        if tool_name == "builtin::rag":
+            tool_dict = dict(
+                name="builtin::rag",
+                args={
+                    "vector_db_ids": list(selected_vector_dbs),
+                },
+            )
+            toolgroup_selection[i] = tool_dict
+
     @st.cache_resource
     def create_agent():
         return Agent(
@@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("request", request.model_dump_json())
             turn_id = str(uuid.uuid4())
             span.set_attribute("turn_id", turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

             await self._initialize_tools(request.toolgroups)
             async for chunk in self._run_turn(request, turn_id):
@@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("request", request.model_dump_json())
             span.set_attribute("turn_id", request.turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

             await self._initialize_tools()
             async for chunk in self._run_turn(request):
@@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
         stop_reason = None

         async with tracing.span("inference") as span:
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,
llama_stack/providers/remote/inference/nvidia/NVIDIA.md (new file, 85 lines)

@@ -0,0 +1,85 @@
# NVIDIA Inference Provider for LlamaStack

This provider enables running inference using NVIDIA NIM.

## Features
- Endpoints for completions, chat completions, and embeddings for registered models

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to NVIDIA NIM deployment
- A NIM deployed for the model to use for inference

### Setup

Build the NVIDIA environment:

```bash
llama stack build --template nvidia --image-type conda
```

### Basic Usage using the LlamaStack Python Client

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = (
    ""  # Required if using hosted NIM endpoint. If self-hosted, not required.
)
os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM URL

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

### Create Completion

```python
response = client.completion(
    model_id="meta-llama/Llama-3.1-8b-Instruct",
    content="Complete the sentence using one word: Roses are red, violets are :",
    stream=False,
    sampling_params={
        "max_tokens": 50,
    },
)
print(f"Response: {response.content}")
```

### Create Chat Completion

```python
response = client.chat_completion(
    model_id="meta-llama/Llama-3.1-8b-Instruct",
    messages=[
        {
            "role": "system",
            "content": "You must respond to each message with only one word",
        },
        {
            "role": "user",
            "content": "Complete the sentence using one word: Roses are red, violets are:",
        },
    ],
    stream=False,
    sampling_params={
        "max_tokens": 50,
    },
)
print(f"Response: {response.completion_message.content}")
```

### Create Embeddings

```python
response = client.embeddings(
    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
)
print(f"Embeddings: {response.embeddings}")
```
@@ -48,6 +48,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.2-90b-vision-instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
@@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES

 # Map API status to JobStatus enum
 STATUS_MAPPING = {
-    "running": "in_progress",
-    "completed": "completed",
-    "failed": "failed",
-    "cancelled": "cancelled",
-    "pending": "scheduled",
+    "running": JobStatus.in_progress.value,
+    "completed": JobStatus.completed.value,
+    "failed": JobStatus.failed.value,
+    "cancelled": JobStatus.cancelled.value,
+    "pending": JobStatus.scheduled.value,
+    "unknown": JobStatus.scheduled.value,
 }
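For context, a self-contained sketch of how a status mapping like this is typically consumed — the `JobStatus` enum below is a stand-in with an assumed shape, not the actual llama_stack definition, and the fallback mirrors the new `"unknown"` entry:

```python
from enum import Enum


class JobStatus(Enum):  # stand-in enum for illustration only
    scheduled = "scheduled"
    in_progress = "in_progress"
    completed = "completed"
    failed = "failed"
    cancelled = "cancelled"


STATUS_MAPPING = {
    "running": JobStatus.in_progress.value,
    "completed": JobStatus.completed.value,
    "failed": JobStatus.failed.value,
    "cancelled": JobStatus.cancelled.value,
    "pending": JobStatus.scheduled.value,
    "unknown": JobStatus.scheduled.value,
}


def to_job_status(api_status: str) -> JobStatus:
    # Translate a raw Customizer API status string into a JobStatus member,
    # falling back to "scheduled" for anything unrecognized.
    return JobStatus(STATUS_MAPPING.get(api_status.lower(), JobStatus.scheduled.value))


assert to_job_status("running") is JobStatus.in_progress
assert to_job_status("something-new") is JobStatus.scheduled
```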
llama_stack/providers/remote/safety/nvidia/README.md (new file, 77 lines)

@@ -0,0 +1,77 @@
# NVIDIA Safety Provider for LlamaStack

This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.

## Features

- Run safety checks for messages

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to NVIDIA NeMo Guardrails service
- A NIM deployed for the model to use for safety checks

### Setup

Build the NVIDIA environment:

```bash
llama stack build --template nvidia --image-type conda
```

### Basic Usage using the LlamaStack Python Client

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = "your-api-key"
os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

#### Create a safety shield

```python
from llama_stack.apis.safety import Shield
from llama_stack.apis.inference import Message

# Create a safety shield
shield = Shield(
    shield_id="your-shield-id",
    provider_resource_id="safety-model-id",  # The model to use for safety checks
    description="Safety checks for content moderation",
)

# Register the shield
await client.safety.register_shield(shield)
```

#### Run safety checks

```python
# Messages to check
messages = [Message(role="user", content="Your message to check")]

# Run safety check
response = await client.safety.run_shield(
    shield_id="your-shield-id",
    messages=messages,
)

# Check for violations
if response.violation:
    print(f"Safety violation detected: {response.violation.user_message}")
    print(f"Violation level: {response.violation.violation_level}")
    print(f"Metadata: {response.violation.metadata}")
else:
    print("No safety violations detected")
```
@@ -25,14 +25,84 @@ The following models are available by default:

{% endif %}

-### Prerequisite: API Keys
+## Prerequisites
+### NVIDIA API Keys

-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.

### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.

The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.

### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.

See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.

### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.

### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.

### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the NVIDIA Safety docs for supported features and example usage.

## Deploying models
To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.

### Via Docker

@@ -54,9 +124,27 @@ docker run \
### Via Conda

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Example Notebooks
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
@@ -59,7 +59,7 @@ def get_distribution_template() -> DistributionTemplate:
     default_models = get_model_registry(available_models)
     return DistributionTemplate(
         name="nvidia",
-        distro_type="remote_hosted",
+        distro_type="self_hosted",
         description="Use NVIDIA NIM for running LLM inference and safety",
         container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
@@ -173,6 +173,16 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.2-90b-vision-instruct
   model_type: llm
+- metadata: {}
+  model_id: meta/llama-3.3-70b-instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192
@@ -46,6 +46,7 @@ dev = [
     "pytest-asyncio",
     "pytest-cov",
     "pytest-html",
+    "pytest-json-report",
     "nbval",  # For notebook testing
     "black",
     "ruff",
@@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
    assert "I can't" in logs_str


def test_agent_name(llama_stack_client, text_model_id):
    agent_name = f"test-agent-{uuid4()}"

    try:
        agent = Agent(
            llama_stack_client,
            model=text_model_id,
            instructions="You are a helpful assistant",
            name=agent_name,
        )
    except TypeError:
        agent = Agent(
            llama_stack_client,
            model=text_model_id,
            instructions="You are a helpful assistant",
        )
        return

    session_id = agent.create_session(f"test-session-{uuid4()}")

    agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": "Give me a sentence that contains the word: hello",
            }
        ],
        session_id=session_id,
        stream=False,
    )

    all_spans = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[
            {"key": "session_id", "op": "eq", "value": session_id},
        ],
        attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
    ):
        all_spans.append(span.attributes)

    agent_name_spans = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[],
        attributes_to_return=["agent_name"],
    ):
        if "agent_name" in span.attributes:
            agent_name_spans.append(span.attributes)

    agent_logs = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[
            {"key": "agent_name", "op": "eq", "value": agent_name},
        ],
        attributes_to_return=["input", "output", "agent_name"],
    ):
        if "output" in span.attributes and span.attributes["output"] != "no shields":
            agent_logs.append(span.attributes)

    assert len(agent_logs) == 1
    assert agent_logs[0]["agent_name"] == agent_name
    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
    assert "hello" in agent_logs[0]["output"].lower()


def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
    common_params = dict(
        model="meta-llama/Llama-3.2-3B-Instruct",
tests/unit/distribution/test_build_path.py (new file, 38 lines)

@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.cli.stack._build import (
    _run_stack_build_command_from_build_config,
)
from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
from llama_stack.distribution.utils.image_types import LlamaStackImageType


def test_container_build_passes_path(monkeypatch, tmp_path):
    called_with = {}

    def spy_build_image(cfg, build_file_path, image_name, template_or_config):
        called_with["path"] = template_or_config
        return 0

    monkeypatch.setattr(
        "llama_stack.cli.stack._build.build_image",
        spy_build_image,
        raising=True,
    )

    cfg = BuildConfig(
        image_type=LlamaStackImageType.CONTAINER.value,
        distribution_spec=DistributionSpec(providers={}, description=""),
    )

    _run_stack_build_command_from_build_config(cfg, image_name="dummy")

    assert "path" in called_with
    assert isinstance(called_with["path"], str)
    assert Path(called_with["path"]).exists()
@@ -200,35 +200,48 @@ class TestNvidiaPostTraining(unittest.TestCase):
         )

     def test_get_training_job_status(self):
-        self.mock_make_request.return_value = {
-            "created_at": "2024-12-09T04:06:28.580220",
-            "updated_at": "2024-12-09T04:21:19.852832",
-            "status": "completed",
-            "steps_completed": 1210,
-            "epochs_completed": 2,
-            "percentage_done": 100.0,
-            "best_epoch": 2,
-            "train_loss": 1.718016266822815,
-            "val_loss": 1.8661999702453613,
-        }
-
-        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
-
-        status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
-
-        assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
-        assert status.status.value == "completed"
-        assert status.steps_completed == 1210
-        assert status.epochs_completed == 2
-        assert status.percentage_done == 100.0
-        assert status.best_epoch == 2
-        assert status.train_loss == 1.718016266822815
-        assert status.val_loss == 1.8661999702453613
-
-        self.mock_make_request.assert_called_once()
-        self._assert_request(
-            self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
-        )
+        customizer_status_to_job_status = [
+            ("running", "in_progress"),
+            ("completed", "completed"),
+            ("failed", "failed"),
+            ("cancelled", "cancelled"),
+            ("pending", "scheduled"),
+            ("unknown", "scheduled"),
+        ]
+
+        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+        for customizer_status, expected_status in customizer_status_to_job_status:
+            with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
+                self.mock_make_request.return_value = {
+                    "created_at": "2024-12-09T04:06:28.580220",
+                    "updated_at": "2024-12-09T04:21:19.852832",
+                    "status": customizer_status,
+                    "steps_completed": 1210,
+                    "epochs_completed": 2,
+                    "percentage_done": 100.0,
+                    "best_epoch": 2,
+                    "train_loss": 1.718016266822815,
+                    "val_loss": 1.8661999702453613,
+                }
+
+                status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
+
+                assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
+                assert status.status.value == expected_status
+                assert status.steps_completed == 1210
+                assert status.epochs_completed == 2
+                assert status.percentage_done == 100.0
+                assert status.best_epoch == 2
+                assert status.train_loss == 1.718016266822815
+                assert status.val_loss == 1.8661999702453613
+
+                self._assert_request(
+                    self.mock_make_request,
+                    "GET",
+                    f"/v1/customization/jobs/{job_id}/status",
+                    expected_params={"job_id": job_id},
+                )

     def test_get_training_jobs(self):
         job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
@@ -8,29 +8,44 @@ This framework allows you to run the same set of verification tests against diff

## Features

-The verification suite currently tests:
+The verification suite currently tests the following in both streaming and non-streaming modes:

-- Basic chat completions (streaming and non-streaming)
+- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality

## Report

The latest report can be found at [REPORT.md](REPORT.md).

To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```

## Running Tests

To run the verification tests, use pytest with the following parameters:

```bash
cd llama-stack
-pytest tests/verifications/openai --provider=<provider-name>
+pytest tests/verifications/openai_api --provider=<provider-name>
```

Example:
```bash
# Run all tests
-pytest tests/verifications/openai --provider=together
+pytest tests/verifications/openai_api --provider=together

# Only run tests with Llama 4 models
-pytest tests/verifications/openai --provider=together -k 'Llama-4'
+pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```

### Parameters

@@ -41,23 +56,22 @@ pytest tests/verifications/openai --provider=together -k 'Llama-4'

## Supported Providers

-The verification suite currently supports:
-- OpenAI
-- Fireworks
-- Together
-- Groq
-- Cerebras
+The verification suite supports any provider with an OpenAI-compatible endpoint.
+
+See `tests/verifications/conf/` for the list of supported providers.
+
+To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.

## Adding New Test Cases

-To add new test cases, create appropriate JSON files in the `openai/fixtures/test_cases/` directory following the existing patterns.
+To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.


## Structure

- `__init__.py` - Marks the directory as a Python package
- `conftest.py` - Global pytest configuration and fixtures
-- `openai/` - Tests specific to OpenAI-compatible APIs
+- `conf/` - Provider-specific configuration files
+- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases
@@ -1,6 +1,6 @@
# Test Results Report

-*Generated on: 2025-04-14 18:11:37*
+*Generated on: 2025-04-16 15:10:57*

*This report was generated by running `python tests/verifications/generate_report.py`*

@@ -15,7 +15,7 @@

| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
-| Together | 48.7% | 37 | 76 |
+| Together | 51.3% | 39 | 76 |
| Fireworks | 47.4% | 36 | 76 |
| Openai | 100.0% | 52 | 52 |

@@ -23,7 +23,7 @@

## Together

-*Tests run on: 2025-04-14 18:08:14*
+*Tests run on: 2025-04-16 15:03:51*

```bash
# Run all tests for this provider:

@@ -49,8 +49,8 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ✅ | ✅ |
-| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ |
+| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |

@@ -74,7 +74,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe

## Fireworks

-*Tests run on: 2025-04-14 18:04:06*
+*Tests run on: 2025-04-16 15:05:54*

```bash
# Run all tests for this provider:

@@ -125,7 +125,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor

## Openai

-*Tests run on: 2025-04-14 18:09:51*
+*Tests run on: 2025-04-16 15:09:18*

```bash
# Run all tests for this provider:
@@ -3,14 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#   "pytest-json-report",
-#   "pyyaml",
-# ]
-# ///
 """
 Test Report Generator

@@ -67,16 +59,10 @@ RESULTS_DIR.mkdir(exist_ok=True)
 # Maximum number of test result files to keep per provider
 MAX_RESULTS_PER_PROVIDER = 1

-PROVIDER_ORDER = [
+DEFAULT_PROVIDERS = [
     "together",
     "fireworks",
-    "groq",
-    "cerebras",
     "openai",
-    "together-llama-stack",
-    "fireworks-llama-stack",
-    "groq-llama-stack",
-    "openai-llama-stack",
 ]

 VERIFICATION_CONFIG = _load_all_verification_configs()

@@ -142,6 +128,14 @@ def run_tests(provider, keyword=None):
         return None


+def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
+    """Runs tests for a list of providers."""
+    print(f"Running tests for providers: {', '.join(providers_to_run)}")
+    for provider in providers_to_run:
+        run_tests(provider.strip(), keyword=keyword)
+    print("Finished running tests.")
+
+
 def parse_results(
     result_file,
 ) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:

@@ -250,20 +244,6 @@ def parse_results(
     return parsed_results, providers_in_file, tests_in_file, run_timestamp_str


-def get_all_result_files_by_provider():
-    """Get all test result files, keyed by provider."""
-    provider_results = {}
-
-    result_files = list(RESULTS_DIR.glob("*.json"))
-
-    for file in result_files:
-        provider = file.stem
-        if provider:
-            provider_results[provider] = file
-
-    return provider_results
-
-
 def generate_report(
     results_dict: Dict[str, Any],
     providers: Dict[str, Set[str]],

@@ -276,6 +256,7 @@ def generate_report(
     Args:
         results_dict: Aggregated results [provider][model][test_name] -> status.
         providers: Dict of all providers and their models {provider: {models}}.
+                   The order of keys in this dict determines the report order.
         all_tests: Set of all test names found.
         provider_timestamps: Dict of provider to timestamp when tests were run
         output_file: Optional path to save the report.

@@ -353,22 +334,17 @@ def generate_report(
                     passed_tests += 1
         provider_totals[provider] = (provider_passed, provider_total)

-    # Add summary table (use passed-in providers dict)
+    # Add summary table (use the order from the providers dict keys)
     report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
     report.append("| --- | --- | --- | --- |")
-    for provider in [p for p in PROVIDER_ORDER if p in providers]:  # Check against keys of passed-in dict
-        passed, total = provider_totals.get(provider, (0, 0))
-        pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
-        report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
-    for provider in [p for p in providers if p not in PROVIDER_ORDER]:  # Check against keys of passed-in dict
+    # Iterate through providers in the order they appear in the input dict
+    for provider in providers_sorted.keys():
         passed, total = provider_totals.get(provider, (0, 0))
         pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
         report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
     report.append("\n")

-    for provider in sorted(
-        providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
-    ):
+    for provider in providers_sorted.keys():
         provider_models = providers_sorted[provider]  # Use sorted models
         if not provider_models:
             continue

@@ -461,60 +437,62 @@ def main():
         "--providers",
         type=str,
         nargs="+",
-        help="Specify providers to test (comma-separated or space-separated, default: all)",
+        help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
     )
     parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
     parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
     args = parser.parse_args()

     all_results = {}
-    # Initialize collections to aggregate results in main
-    aggregated_providers = defaultdict(set)
+    final_providers_order = {}  # Dictionary to store results, preserving processing order
     aggregated_tests = set()
     provider_timestamps = {}

-    if args.run_tests:
-        # Get list of available providers from command line or use detected providers
-        if args.providers:
-            # Handle both comma-separated and space-separated lists
-            test_providers = []
-            for provider_arg in args.providers:
-                # Split by comma if commas are present
-                if "," in provider_arg:
-                    test_providers.extend(provider_arg.split(","))
-                else:
-                    test_providers.append(provider_arg)
-        else:
-            # Default providers to test
-            test_providers = PROVIDER_ORDER
-
-        for provider in test_providers:
-            provider = provider.strip()  # Remove any whitespace
-            result_file = run_tests(provider, keyword=args.k)
-            if result_file:
-                # Parse and aggregate results
-                parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
-                all_results.update(parsed_results)
-                for prov, models in providers_in_file.items():
-                    aggregated_providers[prov].update(models)
-                    if run_timestamp:
-                        provider_timestamps[prov] = run_timestamp
-                aggregated_tests.update(tests_in_file)
-    else:
-        # Use existing results
-        provider_result_files = get_all_result_files_by_provider()
-
-        for result_file in provider_result_files.values():
-            # Parse and aggregate results
-            parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
-            all_results.update(parsed_results)
-            for prov, models in providers_in_file.items():
-                aggregated_providers[prov].update(models)
-                if run_timestamp:
-                    provider_timestamps[prov] = run_timestamp
-            aggregated_tests.update(tests_in_file)
-
-    generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
+    # 1. Determine the desired list and order of providers
+    if args.providers:
+        desired_providers = []
+        for provider_arg in args.providers:
+            desired_providers.extend([p.strip() for p in provider_arg.split(",")])
+    else:
+        desired_providers = DEFAULT_PROVIDERS  # Use default order/list
+
+    # 2. Run tests if requested (using the desired provider list)
+    if args.run_tests:
+        run_multiple_tests(desired_providers, args.k)
+
+    for provider in desired_providers:
+        # Construct the expected result file path directly
+        result_file = RESULTS_DIR / f"{provider}.json"
+
+        if result_file.exists():  # Check if the specific file exists
+            print(f"Loading results for {provider} from {result_file}")
+            try:
+                parsed_data = parse_results(result_file)
+                parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
+                all_results.update(parsed_results)
+                aggregated_tests.update(tests_in_file)
+
+                # Add models for this provider, ensuring it's added in the correct report order
+                if provider in providers_in_file:
+                    if provider not in final_providers_order:
+                        final_providers_order[provider] = set()
+                    final_providers_order[provider].update(providers_in_file[provider])
+                    if run_timestamp != "Unknown":
+                        provider_timestamps[provider] = run_timestamp
+                else:
+                    print(
+                        f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
+                    )
+
+            except Exception as e:
+                print(f"Error parsing results for provider {provider} from {result_file}: {e}")
+        else:
+            # Only print warning if we expected results (i.e., provider was in the desired list)
+            print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
+
+    # 5. Generate the report using the filtered & ordered results
+    print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
+    generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)


 if __name__ == "__main__":
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
uv.lock (generated) · 17 changes

@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.10"
 resolution-markers = [
     "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1410,6 +1411,7 @@ dev = [
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
     { name = "pytest-html" },
+    { name = "pytest-json-report" },
     { name = "ruamel-yaml" },
     { name = "ruff" },
     { name = "types-requests" },
@@ -1502,6 +1504,7 @@ requires-dist = [
     { name = "pytest-asyncio", marker = "extra == 'dev'" },
     { name = "pytest-cov", marker = "extra == 'dev'" },
     { name = "pytest-html", marker = "extra == 'dev'" },
+    { name = "pytest-json-report", marker = "extra == 'dev'" },
     { name = "python-dotenv" },
     { name = "qdrant-client", marker = "extra == 'unit'" },
     { name = "requests" },
@@ -1531,6 +1534,7 @@ requires-dist = [
     { name = "types-setuptools", marker = "extra == 'dev'" },
     { name = "uvicorn", marker = "extra == 'dev'" },
 ]
+provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]

 [[package]]
 name = "llama-stack-client"
@@ -2740,6 +2744,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/c7/c160021cbecd956cc1a6f79e5fe155f7868b2e5b848f1320dad0b3e3122f/pytest_html-4.1.1-py3-none-any.whl", hash = "sha256:c8152cea03bd4e9bee6d525573b67bbc6622967b72b9628dda0ea3e2a0b5dd71", size = 23491 },
 ]

+[[package]]
+name = "pytest-json-report"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+    { name = "pytest-metadata" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4f/d3/765dae9712fcd68d820338908c1337e077d5fdadccd5cacf95b9b0bea278/pytest-json-report-1.5.0.tar.gz", hash = "sha256:2dde3c647851a19b5f3700729e8310a6e66efb2077d674f27ddea3d34dc615de", size = 21241 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/35/d07400c715bf8a88aa0c1ee9c9eb6050ca7fe5b39981f0eea773feeb0681/pytest_json_report-1.5.0-py3-none-any.whl", hash = "sha256:9897b68c910b12a2e48dd849f9a284b2c79a732a8a9cb398452ddd23d3c8c325", size = 13222 },
+]
+
 [[package]]
 name = "pytest-metadata"
 version = "3.1.1"