commit 66cd83fb58
Merge branch 'main' into eval_api_final

37 changed files with 1215 additions and 840 deletions
.github/workflows/integration-tests.yml (16 changes)
@@ -1,9 +1,18 @@
 name: Integration tests

 on:
-  pull_request:
   push:
-    branches: [main]
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'distributions/**'
+      - 'llama_stack/**'
+      - 'tests/integration/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/integration-tests.yml' # This workflow

 jobs:
   ollama:

@@ -56,8 +65,7 @@ jobs:
         INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
       run: |
         source .venv/bin/activate
-        # TODO: use "llama stack run"
-        nohup uv run python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml > server.log 2>&1 &
+        nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

     - name: Wait for Llama Stack server to be ready
       run: |
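The body of the "Wait for Llama Stack server to be ready" step is cut off above. A readiness gate for a setup like this typically polls the server's health endpoint; the following is a sketch only — the port and health route are assumptions not confirmed by the truncated step:

```bash
# Sketch of a readiness loop; port 8321 and the /v1/health route are
# assumptions, not confirmed by the truncated workflow step above.
for _ in $(seq 1 30); do
  if curl -fsS "http://localhost:8321/v1/health" > /dev/null; then
    echo "Llama Stack server is ready"
    exit 0
  fi
  sleep 2
done
echo "Server failed to start in time" >&2
tail -n 50 server.log >&2
exit 1
```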
.github/workflows/providers-build.yml (5 changes)
@@ -40,6 +40,7 @@ jobs:
       matrix:
         template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
         image-type: [venv, container]
+      fail-fast: false # We want to run all jobs even if some fail

     steps:
       - name: Checkout repository

@@ -67,7 +68,9 @@

       - name: Run Llama Stack Build
         run: |
-          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
+          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
+          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

       - name: Print dependencies in the image
         if: matrix.image-type == 'venv'
.github/workflows/unit-tests.yml (8 changes)
@@ -5,6 +5,14 @@ on:
     branches: [ main ]
   pull_request:
     branches: [ main ]
+    paths:
+      - 'distributions/**'
+      - 'llama_stack/**'
+      - 'tests/unit/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/unit-tests.yml' # This workflow
   workflow_dispatch:

 jobs:
.pre-commit-config.yaml

@@ -77,7 +77,7 @@ repos:
       name: Distribution Template Codegen
       additional_dependencies:
         - uv==0.6.0
-      entry: uv run --extra codegen python -m llama_stack.scripts.distro_codegen
+      entry: uv run --extra codegen ./scripts/distro_codegen.py
       language: python
       pass_filenames: false
       require_serial: true
CONTRIBUTING.md

@@ -159,7 +159,7 @@ LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama

 ### Updating Provider Configurations

-If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
+If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.

 ### Building the Documentation
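In practice, that regeneration is the same command the pre-commit hook above now runs, invoked from the repository root:

```bash
# Regenerate distribution YAML files and docs after a provider config change
uv run --extra codegen ./scripts/distro_codegen.py
```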
distributions/dependencies.json

@@ -401,16 +401,13 @@
     ],
     "nvidia": [
       "aiosqlite",
       "autoevals",
       "blobfile",
       "chardet",
       "datasets",
       "faiss-cpu",
       "fastapi",
       "fire",
       "httpx",
       "matplotlib",
       "mcp",
       "nltk",
       "numpy",
       "openai",
docs/_static/llama-stack-spec.html (641 changes)
@@ -2233,6 +2233,67 @@
         }
       },
+      "/v1/datasetio/iterrows/{dataset_id}": {
+        "get": {
+          "responses": {
+            "200": {
+              "description": "OK",
+              "content": {
+                "application/json": {
+                  "schema": {
+                    "$ref": "#/components/schemas/IterrowsResponse"
+                  }
+                }
+              }
+            },
+            "400": {
+              "$ref": "#/components/responses/BadRequest400"
+            },
+            "429": {
+              "$ref": "#/components/responses/TooManyRequests429"
+            },
+            "500": {
+              "$ref": "#/components/responses/InternalServerError500"
+            },
+            "default": {
+              "$ref": "#/components/responses/DefaultError"
+            }
+          },
+          "tags": [
+            "DatasetIO"
+          ],
+          "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.",
+          "parameters": [
+            {
+              "name": "dataset_id",
+              "in": "path",
+              "description": "The ID of the dataset to get the rows from.",
+              "required": true,
+              "schema": {
+                "type": "string"
+              }
+            },
+            {
+              "name": "start_index",
+              "in": "query",
+              "description": "Index into dataset for the first row to get. Get all rows if None.",
+              "required": false,
+              "schema": {
+                "type": "integer"
+              }
+            },
+            {
+              "name": "limit",
+              "in": "query",
+              "description": "The number of rows to get.",
+              "required": false,
+              "schema": {
+                "type": "integer"
+              }
+            }
+          ]
+        }
+      },
       "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
         "get": {
           "responses": {
             "200": {
@@ -6552,100 +6613,14 @@
          "const": "factuality",
          "default": "factuality"
        },
        "factuality": {
          "type": "object",
          "properties": {
            "aggregation_functions": {
              "type": "array",
              "items": {
                "type": "string",
                "enum": ["average", "median", "categorical_count", "accuracy"],
                "title": "AggregationFunctionType",
                "description": "A type of aggregation function."
              }
            }
          },
          "additionalProperties": false,
          "required": ["aggregation_functions"],
          "title": "BasicGraderParams"
        }
      },
      "additionalProperties": false,
      "required": ["type", "factuality"],
      "title": "FactualityGrader"
    },
    "FaithfulnessGrader": {
      "type": "object",
      "properties": {
        "type": {
          "type": "string",
          "const": "faithfulness",
          "default": "faithfulness"
        },
        "faithfulness": {
          "type": "object",
          "properties": {
            "aggregation_functions": {
              "type": "array",
              "items": {
                "type": "string",
                "enum": ["average", "median", "categorical_count", "accuracy"],
                "title": "AggregationFunctionType",
                "description": "A type of aggregation function."
              }
            }
          },
          "additionalProperties": false,
          "required": ["aggregation_functions"],
          "title": "BasicGraderParams"
        }
      },
      "additionalProperties": false,
      "required": ["type", "faithfulness"],
      "title": "FaithfulnessGrader"
    },
    "Grader": {
      "type": "object",
      "properties": {
        "identifier": {
        "dataset_id": {
          "type": "string"
        },
        "provider_resource_id": {
          "type": "string"
        },
        "provider_id": {
          "type": "string"
        },
        "type": {
          "type": "string",
          "const": "grader",
          "default": "grader"
        },
        "grader": {
          "$ref": "#/components/schemas/GraderDefinition"
        },
        "description": {
          "type": "string"
        "scoring_functions": {
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "metadata": {
          "type": "object",
@@ -6679,98 +6654,163 @@
        "provider_resource_id",
        "provider_id",
        "type",
        "grader",
        "dataset_id",
        "scoring_functions",
        "metadata"
      ],
      "title": "Grader"
      "title": "Benchmark"
    },
    "GraderDefinition": {
    "DataSource": {
      "oneOf": [
        {
          "$ref": "#/components/schemas/LlmGrader"
          "$ref": "#/components/schemas/URIDataSource"
        },
        {
          "$ref": "#/components/schemas/RegexParserGrader"
        },
        {
          "$ref": "#/components/schemas/EqualityGrader"
        },
        {
          "$ref": "#/components/schemas/SubsetOfGrader"
        },
        {
          "$ref": "#/components/schemas/FactualityGrader"
        },
        {
          "$ref": "#/components/schemas/FaithfulnessGrader"
          "$ref": "#/components/schemas/RowsDataSource"
        }
      ],
      "discriminator": {
        "propertyName": "type",
        "mapping": {
          "llm": "#/components/schemas/LlmGrader",
          "regex_parser": "#/components/schemas/RegexParserGrader",
          "equality": "#/components/schemas/EqualityGrader",
          "subset_of": "#/components/schemas/SubsetOfGrader",
          "factuality": "#/components/schemas/FactualityGrader",
          "faithfulness": "#/components/schemas/FaithfulnessGrader"
          "uri": "#/components/schemas/URIDataSource",
          "rows": "#/components/schemas/RowsDataSource"
        }
      }
    },
    "LlmGrader": {
    "Grader": {
      "type": "object",
      "properties": {
        "identifier": {
          "type": "string"
        },
        "provider_resource_id": {
          "type": "string"
        },
        "provider_id": {
          "type": "string"
        },
        "type": {
          "type": "string",
          "const": "grader",
          "default": "grader"
        },
        "purpose": {
          "type": "string",
          "enum": ["post-training/messages", "eval/question-answer", "eval/messages-answer"],
          "title": "DatasetPurpose",
          "description": "Purpose of the dataset. Each purpose has a required input data schema."
        },
        "source": {
          "$ref": "#/components/schemas/DataSource"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": {
            "oneOf": [
              { "type": "null" },
              { "type": "boolean" },
              { "type": "number" },
              { "type": "string" },
              { "type": "array" },
              { "type": "object" }
            ]
          }
        }
      },
      "additionalProperties": false,
      "required": [
        "identifier",
        "provider_resource_id",
        "provider_id",
        "type",
        "purpose",
        "source",
        "metadata"
      ],
      "title": "Dataset"
    },
    "RowsDataSource": {
      "type": "object",
      "properties": {
        "type": {
          "type": "string",
          "const": "llm",
          "default": "llm"
          "const": "rows",
          "default": "rows"
        },
        "llm": {
          "type": "object",
          "properties": {
            "model": {
              "type": "string"
            },
            "prompt": {
              "type": "string"
            },
            "score_regexes": {
              "type": "array",
              "items": {
                "type": "string"
              }
            },
            "aggregation_functions": {
              "type": "array",
              "items": {
                "type": "string",
                "enum": ["average", "median", "categorical_count", "accuracy"],
                "title": "AggregationFunctionType",
                "description": "A type of aggregation function."
              }
        "rows": {
          "type": "array",
          "items": {
            "type": "object",
            "additionalProperties": {
              "oneOf": [
                { "type": "null" },
                { "type": "boolean" },
                { "type": "number" },
                { "type": "string" },
                { "type": "array" },
                { "type": "object" }
              ]
            }
          },
          "additionalProperties": false,
          "required": [
            "model",
            "prompt",
            "score_regexes",
            "aggregation_functions"
          ],
          "title": "LlmGraderParams"
          "description": "The dataset is stored in rows. E.g. - [ {\"messages\": [{\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}]} ]"
        }
      },
      "additionalProperties": false,
      "required": [
        "type",
        "llm"
        "rows"
      ],
      "title": "LlmGrader"
      "title": "RowsDataSource",
      "description": "A dataset stored in rows."
    },
    "URIDataSource": {
      "type": "object",
      "properties": {
        "type": {
          "type": "string",
          "const": "uri",
          "default": "uri"
        },
        "uri": {
          "type": "string",
          "description": "The dataset can be obtained from a URI. E.g. - \"https://mywebsite.com/mydata.jsonl\" - \"lsfs://mydata.jsonl\" - \"data:csv;base64,{base64_content}\""
        }
      },
      "additionalProperties": false,
      "required": [
        "type",
        "uri"
      ],
      "title": "URIDataSource",
      "description": "A dataset that can be obtained from a URI."
    },
    "RegexParserGrader": {
      "type": "object",
@@ -6819,45 +6859,182 @@
      ],
      "title": "RegexParserGrader"
    },
    "SubsetOfGrader": {
    "ModelType": {
      "type": "string",
      "enum": ["llm", "embedding"],
      "title": "ModelType"
    },
    "AgentTurnInputType": {
      "type": "object",
      "properties": {
        "type": {
          "type": "string",
          "const": "subset_of",
          "default": "subset_of"
        },
        "subset_of": {
          "type": "object",
          "properties": {
            "aggregation_functions": {
              "type": "array",
              "items": {
                "type": "string",
                "enum": ["average", "median", "categorical_count", "accuracy"],
                "title": "AggregationFunctionType",
                "description": "A type of aggregation function."
              }
            }
          },
          "additionalProperties": false,
          "required": ["aggregation_functions"],
          "title": "BasicGraderParams"
          "const": "agent_turn_input",
          "default": "agent_turn_input"
        }
      },
      "additionalProperties": false,
      "required": [
        "type",
        "subset_of"
        "type"
      ],
      "title": "SubsetOfGrader"
      "title": "AgentTurnInputType"
    },
    "ArrayType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "array", "default": "array" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "ArrayType"
    },
    "BooleanType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "boolean", "default": "boolean" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "BooleanType"
    },
    "ChatCompletionInputType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "chat_completion_input", "default": "chat_completion_input" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "ChatCompletionInputType"
    },
    "CompletionInputType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "completion_input", "default": "completion_input" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "CompletionInputType"
    },
    "JsonType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "json", "default": "json" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "JsonType"
    },
    "NumberType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "number", "default": "number" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "NumberType"
    },
    "ObjectType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "object", "default": "object" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "ObjectType"
    },
    "ParamType": {
      "oneOf": [
        { "$ref": "#/components/schemas/StringType" },
        { "$ref": "#/components/schemas/NumberType" },
        { "$ref": "#/components/schemas/BooleanType" },
        { "$ref": "#/components/schemas/ArrayType" },
        { "$ref": "#/components/schemas/ObjectType" },
        { "$ref": "#/components/schemas/JsonType" },
        { "$ref": "#/components/schemas/UnionType" },
        { "$ref": "#/components/schemas/ChatCompletionInputType" },
        { "$ref": "#/components/schemas/CompletionInputType" },
        { "$ref": "#/components/schemas/AgentTurnInputType" }
      ],
      "discriminator": {
        "propertyName": "type",
        "mapping": {
          "string": "#/components/schemas/StringType",
          "number": "#/components/schemas/NumberType",
          "boolean": "#/components/schemas/BooleanType",
          "array": "#/components/schemas/ArrayType",
          "object": "#/components/schemas/ObjectType",
          "json": "#/components/schemas/JsonType",
          "union": "#/components/schemas/UnionType",
          "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
          "completion_input": "#/components/schemas/CompletionInputType",
          "agent_turn_input": "#/components/schemas/AgentTurnInputType"
        }
      }
    },
    "Model": {
      "type": "object",
@@ -6913,17 +7090,39 @@
        "provider_id",
        "type",
        "metadata",
        "model_type"
        "return_type"
      ],
      "title": "Model"
      "title": "ScoringFn"
    },
    "ModelType": {
      "type": "string",
      "enum": [
        "llm",
        "embedding"
    "StringType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "string", "default": "string" }
      },
      "additionalProperties": false,
      "required": [
        "type"
      ],
      "title": "ModelType"
      "title": "StringType"
    },
    "UnionType": {
      "type": "object",
      "properties": {
        "type": { "type": "string", "const": "union", "default": "union" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "UnionType"
    },
    "Shield": {
      "type": "object",
@@ -8131,7 +8330,7 @@
        },
        "description": "The rows in the current page."
      },
-     "next_index": {
+     "next_start_index": {
        "type": "integer",
        "description": "Index into dataset for the first row in the next page. None if there are no more rows."
      }
@@ -9440,7 +9639,7 @@
      },
      "source": {
        "$ref": "#/components/schemas/DataSource",
-       "description": "The data source of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
+       "description": "The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples: - { \"type\": \"uri\", \"uri\": \"https://mywebsite.com/mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"lsfs://mydata.jsonl\" } - { \"type\": \"uri\", \"uri\": \"data:csv;base64,{base64_content}\" } - { \"type\": \"uri\", \"uri\": \"huggingface://llamastack/simpleqa?split=train\" } - { \"type\": \"rows\", \"rows\": [ { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } ] }"
      },
      "metadata": {
        "type": "object",
@@ -9478,50 +9677,6 @@
        "purpose",
        "source"
      ],
      "title": "RegisterDatasetRequest"
    },
-   "RegisterGraderRequest": {
-     "type": "object",
-     "properties": {
-       "grader": {
-         "$ref": "#/components/schemas/GraderDefinition",
-         "description": "The grader definition, E.g. - { \"type\": \"llm\", \"llm\": { \"model\": \"llama-405b\", \"prompt\": \"You are a judge. Score the answer based on the question. {question} {answer}\", } }"
-       },
-       "grader_id": {
-         "type": "string",
-         "description": "(Optional) The ID of the grader. If not provided, a random ID will be generated."
-       },
-       "metadata": {
-         "type": "object",
-         "additionalProperties": {
-           "oneOf": [
-             { "type": "null" },
-             { "type": "boolean" },
-             { "type": "number" },
-             { "type": "string" },
-             { "type": "array" },
-             { "type": "object" }
-           ]
-         },
-         "description": "(Optional) Any additional metadata for this grader. - E.g. { \"description\": \"A grader that scores the answer based on the question.\", }"
-       }
-     },
-     "additionalProperties": false,
-     "required": [
-       "grader"
-     ],
-     "title": "RegisterGraderRequest"
-   },
    "RegisterModelRequest": {
@@ -10199,9 +10354,6 @@
      {
        "name": "Files"
      },
-     {
-       "name": "Graders"
-     },
      {
        "name": "Inference",
        "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -10254,9 +10406,8 @@
        "Benchmarks",
        "DatasetIO",
        "Datasets",
-       "Evaluation",
+       "Eval",
        "Files",
-       "Graders",
        "Inference",
        "Inspect",
        "Models",
docs/_static/llama-stack-spec.yaml (566 changes)
@@ -1507,6 +1507,50 @@ paths:
               $ref: '#/components/schemas/InvokeToolRequest'
         required: true
+  /v1/datasetio/iterrows/{dataset_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IterrowsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - DatasetIO
+      description: >-
+        Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+      parameters:
+        - name: dataset_id
+          in: path
+          description: >-
+            The ID of the dataset to get the rows from.
+          required: true
+          schema:
+            type: string
+        - name: start_index
+          in: query
+          description: >-
+            Index into dataset for the first row to get. Get all rows if None.
+          required: false
+          schema:
+            type: integer
+        - name: limit
+          in: query
+          description: The number of rows to get.
+          required: false
+          schema:
+            type: integer
   /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
     get:
       responses:
         '200':
@@ -4527,255 +4571,6 @@ components:
       title: URIDataSource
       description: >-
         A dataset that can be obtained from a URI.
-    EqualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: equality
-          default: equality
-        equality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - equality
-      title: EqualityGrader
-    FactualityGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: factuality
-          default: factuality
-        factuality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - factuality
-      title: FactualityGrader
-    FaithfulnessGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: faithfulness
-          default: faithfulness
-        faithfulness:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - faithfulness
-      title: FaithfulnessGrader
-    Grader:
-      type: object
-      properties:
-        identifier:
-          type: string
-        provider_resource_id:
-          type: string
-        provider_id:
-          type: string
-        type:
-          type: string
-          const: grader
-          default: grader
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-        description:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - identifier
-        - provider_resource_id
-        - provider_id
-        - type
-        - grader
-        - metadata
-      title: Grader
-    GraderDefinition:
-      oneOf:
-        - $ref: '#/components/schemas/LlmGrader'
-        - $ref: '#/components/schemas/RegexParserGrader'
-        - $ref: '#/components/schemas/EqualityGrader'
-        - $ref: '#/components/schemas/SubsetOfGrader'
-        - $ref: '#/components/schemas/FactualityGrader'
-        - $ref: '#/components/schemas/FaithfulnessGrader'
-      discriminator:
-        propertyName: type
-        mapping:
-          llm: '#/components/schemas/LlmGrader'
-          regex_parser: '#/components/schemas/RegexParserGrader'
-          equality: '#/components/schemas/EqualityGrader'
-          subset_of: '#/components/schemas/SubsetOfGrader'
-          factuality: '#/components/schemas/FactualityGrader'
-          faithfulness: '#/components/schemas/FaithfulnessGrader'
-    LlmGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-        llm:
-          type: object
-          properties:
-            model:
-              type: string
-            prompt:
-              type: string
-            score_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - model
-            - prompt
-            - score_regexes
-            - aggregation_functions
-          title: LlmGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - llm
-      title: LlmGrader
-    RegexParserGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: regex_parser
-          default: regex_parser
-        regex_parser:
-          type: object
-          properties:
-            parsing_regexes:
-              type: array
-              items:
-                type: string
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - parsing_regexes
-            - aggregation_functions
-          title: RegexParserGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - regex_parser
-      title: RegexParserGrader
-    SubsetOfGrader:
-      type: object
-      properties:
-        type:
-          type: string
-          const: subset_of
-          default: subset_of
-        subset_of:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-          additionalProperties: false
-          required:
-            - aggregation_functions
-          title: BasicGraderParams
-      additionalProperties: false
-      required:
-        - type
-        - subset_of
-      title: SubsetOfGrader
     Model:
       type: object
       properties:
@@ -4817,6 +4612,224 @@ components:
         - llm
         - embedding
       title: ModelType
    AgentTurnInputType:
      type: object
      properties:
        type:
          type: string
          const: agent_turn_input
          default: agent_turn_input
      additionalProperties: false
      required:
        - type
      title: AgentTurnInputType
    ArrayType:
      type: object
      properties:
        type:
          type: string
          const: array
          default: array
      additionalProperties: false
      required:
        - type
      title: ArrayType
    BooleanType:
      type: object
      properties:
        type:
          type: string
          const: boolean
          default: boolean
      additionalProperties: false
      required:
        - type
      title: BooleanType
    ChatCompletionInputType:
      type: object
      properties:
        type:
          type: string
          const: chat_completion_input
          default: chat_completion_input
      additionalProperties: false
      required:
        - type
      title: ChatCompletionInputType
    CompletionInputType:
      type: object
      properties:
        type:
          type: string
          const: completion_input
          default: completion_input
      additionalProperties: false
      required:
        - type
      title: CompletionInputType
    JsonType:
      type: object
      properties:
        type:
          type: string
          const: json
          default: json
      additionalProperties: false
      required:
        - type
      title: JsonType
    RowsDataSource:
      type: object
      properties:
        type:
          type: string
          const: rows
          default: rows
        rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: >-
            The dataset is stored in rows. E.g. - [ {"messages": [{"role": "user",
            "content": "Hello, world!"}, {"role": "assistant", "content": "Hello,
            world!"}]} ]
      additionalProperties: false
      required:
        - type
        - rows
      title: RowsDataSource
      description: A dataset stored in rows.
    URIDataSource:
      type: object
      properties:
        type:
          type: string
          const: uri
          default: uri
        uri:
          type: string
          description: >-
            The dataset can be obtained from a URI. E.g. - "https://mywebsite.com/mydata.jsonl"
            - "lsfs://mydata.jsonl" - "data:csv;base64,{base64_content}"
      additionalProperties: false
      required:
        - type
        - uri
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
    NumberType:
      type: object
      properties:
        type:
          type: string
          const: number
          default: number
      additionalProperties: false
      required:
        - type
      title: NumberType
    ObjectType:
      type: object
      properties:
        type:
          type: string
          const: object
          default: object
      additionalProperties: false
      required:
        - type
      title: ObjectType
    ParamType:
      oneOf:
        - $ref: '#/components/schemas/StringType'
        - $ref: '#/components/schemas/NumberType'
        - $ref: '#/components/schemas/BooleanType'
        - $ref: '#/components/schemas/ArrayType'
        - $ref: '#/components/schemas/ObjectType'
        - $ref: '#/components/schemas/JsonType'
        - $ref: '#/components/schemas/UnionType'
        - $ref: '#/components/schemas/ChatCompletionInputType'
        - $ref: '#/components/schemas/CompletionInputType'
        - $ref: '#/components/schemas/AgentTurnInputType'
      discriminator:
        propertyName: type
        mapping:
          string: '#/components/schemas/StringType'
          number: '#/components/schemas/NumberType'
          boolean: '#/components/schemas/BooleanType'
          array: '#/components/schemas/ArrayType'
          object: '#/components/schemas/ObjectType'
          json: '#/components/schemas/JsonType'
          union: '#/components/schemas/UnionType'
          chat_completion_input: '#/components/schemas/ChatCompletionInputType'
          completion_input: '#/components/schemas/CompletionInputType'
          agent_turn_input: '#/components/schemas/AgentTurnInputType'
    ScoringFn:
      type: object
      properties:
        identifier:
          type: string
        provider_resource_id:
          type: string
        provider_id:
          type: string
        type:
          type: string
          const: scoring_function
          default: scoring_function
        description:
          type: string
        metadata:
          type: object
          additionalProperties:
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
        return_type:
          $ref: '#/components/schemas/ParamType'
        params:
          $ref: '#/components/schemas/ScoringFnParams'
      additionalProperties: false
      required:
        - identifier
        - provider_resource_id
        - provider_id
        - type
        - metadata
        - return_type
      title: ScoringFn
    StringType:
      type: object
      properties:
        type:
          type: string
          const: string
          default: string
      additionalProperties: false
      required:
        - type
      title: StringType
    UnionType:
      type: object
      properties:
        type:
          type: string
          const: union
          default: union
      additionalProperties: false
      required:
        - type
      title: UnionType
    Shield:
      type: object
      properties:
@@ -5580,7 +5593,7 @@ components:
              - type: array
              - type: object
          description: The rows in the current page.
-        next_index:
+        next_start_index:
          type: integer
          description: >-
            Index into dataset for the first row in the next page. None if there are
@@ -6461,12 +6474,14 @@ components:
        source:
          $ref: '#/components/schemas/DataSource'
-         description: >-
-           The data source of the dataset. Examples: - { "type": "uri", "uri": "https://mywebsite.com/mydata.jsonl"
-           } - { "type": "uri", "uri": "lsfs://mydata.jsonl" } - { "type": "uri",
-           "uri": "data:csv;base64,{base64_content}" } - { "type": "uri", "uri":
-           "huggingface://llamastack/simpleqa?split=train" } - { "type": "rows",
-           "rows": [ { "messages": [ {"role": "user", "content": "Hello, world!"},
-           {"role": "assistant", "content": "Hello, world!"}, ] } ] }
+         description: >-
+           The data source of the dataset. Ensure that the data source schema is
+           compatible with the purpose of the dataset. Examples: - { "type": "uri",
+           "uri": "https://mywebsite.com/mydata.jsonl" } - { "type": "uri", "uri":
+           "lsfs://mydata.jsonl" } - { "type": "uri", "uri": "data:csv;base64,{base64_content}"
+           } - { "type": "uri", "uri": "huggingface://llamastack/simpleqa?split=train"
+           } - { "type": "rows", "rows": [ { "messages": [ {"role": "user", "content":
+           "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ]
+           } ] }
        metadata:
          type: object
          additionalProperties:
@@ -6488,37 +6503,6 @@ components:
         - purpose
         - source
       title: RegisterDatasetRequest
-    RegisterGraderRequest:
-      type: object
-      properties:
-        grader:
-          $ref: '#/components/schemas/GraderDefinition'
-          description: >-
-            The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b",
-            "prompt": "You are a judge. Score the answer based on the question. {question}
-            {answer}", } }
-        grader_id:
-          type: string
-          description: >-
-            (Optional) The ID of the grader. If not provided, a random ID will be
-            generated.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            (Optional) Any additional metadata for this grader. - E.g. { "description":
-            "A grader that scores the answer based on the question.", }
-      additionalProperties: false
-      required:
-        - grader
-      title: RegisterGraderRequest
     RegisterModelRequest:
       type: object
       properties:
@@ -6951,9 +6935,10 @@ tags:
   - name: Benchmarks
   - name: DatasetIO
   - name: Datasets
-  - name: Evaluation
+  - name: Eval
+    x-displayName: >-
+      Llama Stack Evaluation API for running evaluations on model and agent candidates.
   - name: Files
-  - name: Graders
   - name: Inference
     description: >-
       This API provides the raw interface to the underlying models. Two kinds of models
@@ -6988,9 +6973,8 @@ x-tagGroups:
       - Benchmarks
       - DatasetIO
       - Datasets
-      - Evaluation
+      - Eval
       - Files
-      - Graders
       - Inference
       - Inspect
       - Models
docs/source/contributing/new_api_provider.md

@@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute their implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify any necessary pip dependencies.
-- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.


 Here are some example PRs to help you get started:
docs/source/distributions/self_hosted_distro/nvidia.md

@@ -6,13 +6,13 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| datasetio | `remote::huggingface`, `inline::localfs` |
+| datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
-| safety | `inline::llama-guard` |
+| safety | `remote::nvidia` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `inline::rag-runtime` |
 | vector_io | `inline::faiss` |
@@ -20,8 +20,10 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov

 The following environment variables can be configured:

 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
+- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
+- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
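As an illustration of how these variables are supplied, launching the distribution might look like the sketch below. The `run.yaml` path follows the pattern used for the ollama template in the CI workflow above and is an assumption, not part of this diff:

```bash
# Hypothetical launch; variable names come from the list above, while the
# template path is an assumption based on the repository layout.
LLAMASTACK_PORT=5001 \
NVIDIA_API_KEY=your-key \
SAFETY_MODEL=meta/llama-3.1-8b-instruct \
llama stack run ./llama_stack/templates/nvidia/run.yaml --image-type venv
```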

 ### Models

docs/source/references/llama_stack_client_cli_reference.md
@@ -6,17 +6,32 @@ The `llama-stack-client` CLI allows you to query information about the distribut

 ### `llama-stack-client`
 ```bash
-llama-stack-client -h
+llama-stack-client
+Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...

-usage: llama-stack-client [-h] {models,memory_banks,shields} ...
-
-Welcome to the LlamaStackClient CLI
-
-options:
-  -h, --help            show this help message and exit
-
-subcommands:
-  {models,memory_banks,shields}
+  Welcome to the LlamaStackClient CLI
+
+Options:
+  --version        Show the version and exit.
+  --endpoint TEXT  Llama Stack distribution endpoint
+  --api-key TEXT   Llama Stack distribution API key
+  --config TEXT    Path to config file
+  --help           Show this message and exit.
+
+Commands:
+  configure          Configure Llama Stack Client CLI.
+  datasets           Manage datasets.
+  eval               Run evaluation tasks.
+  eval_tasks         Manage evaluation tasks.
+  inference          Inference (chat).
+  inspect            Inspect server configuration.
+  models             Manage GenAI models.
+  post_training      Post-training.
+  providers          Manage API providers.
+  scoring_functions  Manage scoring functions.
+  shields            Manage safety shield services.
+  toolgroups         Manage available tool groups.
+  vector_dbs         Manage vector databases.
 ```

 ### `llama-stack-client configure`
@@ -127,11 +142,11 @@ llama-stack-client vector_dbs list
 llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
 ```

-Options:
-- `--provider-id`: Optional. Provider ID for the vector db
-- `--provider-vector-db-id`: Optional. Provider's vector db ID
-- `--embedding-model`: Optional. Embedding model to use. Default: "all-MiniLM-L6-v2"
-- `--embedding-dimension`: Optional. Dimension of embeddings. Default: 384
+Optional arguments:
+- `--provider-id`: Provider ID for the vector db
+- `--provider-vector-db-id`: Provider's vector db ID
+- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
+- `--embedding-dimension`: Dimension of embeddings. Default: 384

 ### `llama-stack-client vector_dbs unregister`
 ```bash
@@ -157,11 +172,13 @@ llama-stack-client shields list
 llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
 ```

-Options:
-- `--shield-id`: Required. ID of the shield
-- `--provider-id`: Optional. Provider ID for the shield
-- `--provider-shield-id`: Optional. Provider's shield ID
-- `--params`: Optional. JSON configuration parameters for the shield
+Required arguments:
+- `--shield-id`: ID of the shield
+
+Optional arguments:
+- `--provider-id`: Provider ID for the shield
+- `--provider-shield-id`: Provider's shield ID
+- `--params`: JSON configuration parameters for the shield

 ## Eval Task Management
@@ -175,13 +192,15 @@ llama-stack-client benchmarks list
 llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
 ```

-Options:
-- `--eval-task-id`: Required. ID of the eval task
-- `--dataset-id`: Required. ID of the dataset to evaluate
-- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
-- `--provider-id`: Optional. Provider ID for the eval task
-- `--provider-eval-task-id`: Optional. Provider's eval task ID
-- `--metadata`: Optional. Metadata for the eval task in JSON format
+Required arguments:
+- `--eval-task-id`: ID of the eval task
+- `--dataset-id`: ID of the dataset to evaluate
+- `--scoring-functions`: One or more scoring functions to use for evaluation
+
+Optional arguments:
+- `--provider-id`: Provider ID for the eval task
+- `--provider-eval-task-id`: Provider's eval task ID
+- `--metadata`: Metadata for the eval task in JSON format

 ## Eval execution
 ### `llama-stack-client eval run-benchmark`
@@ -189,11 +208,13 @@ Options:
 llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
 ```

-Options:
-- `--eval-task-config`: Required. Path to the eval task config file in JSON format
-- `--output-dir`: Required. Path to the directory where evaluation results will be saved
-- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
-- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
+Required arguments:
+- `--eval-task-config`: Path to the eval task config file in JSON format
+- `--output-dir`: Path to the directory where evaluation results will be saved
+
+Optional arguments:
+- `--num-examples`: Number of examples to evaluate (useful for debugging)
+- `--visualize`: If set, visualizes evaluation results after completion

 Example benchmark_config.json:
 ```json
@@ -214,11 +235,13 @@ Example benchmark_config.json:
 llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
 ```

-Options:
-- `--eval-task-config`: Required. Path to the eval task config file in JSON format
-- `--output-dir`: Required. Path to the directory where scoring results will be saved
-- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
-- `--visualize`: Optional flag. If set, visualizes scoring results after completion
+Required arguments:
+- `--eval-task-config`: Path to the eval task config file in JSON format
+- `--output-dir`: Path to the directory where scoring results will be saved
+
+Optional arguments:
+- `--num-examples`: Number of examples to evaluate (useful for debugging)
+- `--visualize`: If set, visualizes scoring results after completion

 ## Tool Group Management
@@ -230,11 +253,11 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
 | builtin::code_interpreter | code-interpreter | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::websearch        | tavily-search    | None | None          |
 +---------------------------+------------------+------+---------------+
 ```
@@ -250,11 +273,11 @@ Shows detailed information about a specific toolgroup. If the toolgroup is not f
 llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
 ```

-Options:
-- `--provider-id`: Optional. Provider ID for the toolgroup
-- `--provider-toolgroup-id`: Optional. Provider's toolgroup ID
-- `--mcp-config`: Optional. JSON configuration for the MCP endpoint
-- `--args`: Optional. JSON arguments for the toolgroup
+Optional arguments:
+- `--provider-id`: Provider ID for the toolgroup
+- `--provider-toolgroup-id`: Provider's toolgroup ID
+- `--mcp-config`: JSON configuration for the MCP endpoint
+- `--args`: JSON arguments for the toolgroup

 ### `llama-stack-client toolgroups unregister`
 ```bash
llama_stack/apis/datasetio/datasetio.py

@@ -18,11 +18,11 @@ class IterrowsResponse(BaseModel):
     A paginated list of rows from a dataset.

     :param data: The rows in the current page.
-    :param next_index: Index into dataset for the first row in the next page. None if there are no more rows.
+    :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
     """

     data: List[Dict[str, Any]]
-    next_index: Optional[int] = None
+    next_start_index: Optional[int] = None


 class DatasetStore(Protocol):

@@ -46,9 +46,11 @@ class DatasetIO(Protocol):

         :param dataset_id: The ID of the dataset to get the rows from.
         :param start_index: Index into dataset for the first row to get. Get all rows if None.
-        :param limit: The number of rows to get per page.
+        :param limit: The number of rows to get.
         """
         ...

     @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
+    async def append_rows(
+        self, dataset_id: str, rows: List[Dict[str, Any]]
+    ) -> None: ...
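Together with the `next_index` → `next_start_index` rename, cursor-based pagination from a client would look roughly like the following sketch. The `client.datasetio.iterrows` call shape is an assumption derived from the route and parameters above, not an API confirmed by this diff:

```python
# Hypothetical pagination loop over /v1/datasetio/iterrows/{dataset_id}.
# The client attribute and method names are assumptions; the parameter and
# response field names (start_index, limit, data, next_start_index) come
# from the API definitions above.
def fetch_all_rows(client, dataset_id: str, page_size: int = 100) -> list:
    rows = []
    start_index = 0
    while True:
        page = client.datasetio.iterrows(
            dataset_id=dataset_id,
            start_index=start_index,
            limit=page_size,
        )
        rows.extend(page.data)
        if page.next_start_index is None:  # no more pages
            return rows
        start_index = page.next_start_index
```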
llama_stack/apis/datasets/datasets.py

@@ -163,7 +163,7 @@ class Datasets(Protocol):
             ],
             "answer": "John Doe"
         }
-        :param source: The data source of the dataset. Examples:
+        :param source: The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples:
             - {
                 "type": "uri",
                 "uri": "https://mywebsite.com/mydata.jsonl"
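As a concrete illustration of a source that matches its purpose, registering a small inline dataset might look like this sketch. The `client.datasets.register` call shape is an assumption; the `purpose` value and `source` structure mirror the DatasetPurpose and RowsDataSource schemas above:

```python
# Hypothetical registration call; field values follow the schemas above,
# while the client method itself is an assumption.
client.datasets.register(
    purpose="eval/messages-answer",
    source={
        "type": "rows",
        "rows": [
            {
                "messages": [
                    {"role": "user", "content": "Hello, world!"},
                    {"role": "assistant", "content": "Hello, world!"},
                ]
            }
        ],
    },
)
```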
llama_stack/cli/stack/_build.py

@@ -38,7 +38,7 @@ from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.resolver import InvalidProviderError
 from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
-from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty
+from llama_stack.distribution.utils.exec import formulate_run_args, run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api

@@ -213,7 +213,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
     config = parse_and_maybe_upgrade_config(config_dict)
     run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
     run_args.extend([run_config, str(os.getenv("LLAMA_STACK_PORT", 8321))])
-    run_with_pty(run_args)
+    run_command(run_args)


 def _generate_run_config(
llama_stack/cli/stack/run.py

@@ -82,7 +82,7 @@ class StackRun(Subcommand):

         from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
         from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
-        from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty
+        from llama_stack.distribution.utils.exec import formulate_run_args, run_command

         config_file = Path(args.config)
         has_yaml_suffix = args.config.endswith(".yaml")

@@ -136,4 +136,4 @@ class StackRun(Subcommand):

         if args.tls_keyfile and args.tls_certfile:
             run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
-        run_with_pty(run_args)
+        run_command(run_args)
llama_stack/distribution/build.py

@@ -6,7 +6,6 @@

 import importlib.resources
 import logging
-import sys
 from pathlib import Path
 from typing import Dict, List

@@ -15,7 +14,7 @@ from termcolor import cprint

 from llama_stack.distribution.datatypes import BuildConfig, Provider
 from llama_stack.distribution.distribution import get_provider_registry
-from llama_stack.distribution.utils.exec import run_command, run_with_pty
+from llama_stack.distribution.utils.exec import run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api

@@ -123,11 +122,7 @@ def build_image(
     if special_deps:
         args.append("#".join(special_deps))

-    is_terminal = sys.stdin.isatty()
-    if is_terminal:
-        return_code = run_with_pty(args)
-    else:
-        return_code = run_command(args)
+    return_code = run_command(args)

     if return_code != 0:
         log.error(
@@ -43,7 +43,7 @@ RED='\033[0;31m'
NC='\033[0m' # No Color

CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
CONTAINER_OPTS=${CONTAINER_OPTS:-}
CONTAINER_OPTS=${CONTAINER_OPTS:---progress=plain}

TEMP_DIR=$(mktemp -d)

@@ -253,8 +253,7 @@ $CONTAINER_BINARY build \
  "${CLI_ARGS[@]}" \
  -t "$image_tag" \
  -f "$TEMP_DIR/Containerfile" \
  "." \
  --progress=plain
  "."

# clean up tmp/configs
set +x

@@ -8,10 +8,13 @@
from pydantic import BaseModel

from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers
from llama_stack.log import get_logger

from .datatypes import StackRunConfig
from .stack import redact_sensitive_fields

logger = get_logger(name=__name__, category="core")


class ProviderImplConfig(BaseModel):
    run_config: StackRunConfig

@@ -31,6 +34,10 @@ class ProviderImpl(Providers):
    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        logger.debug("ProviderImpl.shutdown")
        pass

    async def list_providers(self) -> ListProvidersResponse:
        run_config = self.config.run_config
        safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump()))

@@ -4,13 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import errno
import logging
import os
import select
import signal
import subprocess
import sys

from termcolor import cprint

@@ -88,13 +85,6 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list:
    return run_args


def run_with_pty(command):
    if sys.platform.startswith("win"):
        return _run_with_pty_win(command)
    else:
        return _run_with_pty_unix(command)


def in_notebook():
    try:
        from IPython import get_ipython

@@ -108,19 +98,19 @@ def in_notebook():
        return True


# run a command in a pseudo-terminal, with interrupt handling,
# useful when you want to run interactive things
def _run_with_pty_unix(command):
    import pty
    import termios
def run_command(command: list[str]) -> int:
    """
    Run a command with interrupt handling and output capture.
    Uses subprocess.run with direct stream piping for better performance.

    master, slave = pty.openpty()
    Args:
        command (list): The command to run.

    old_settings = termios.tcgetattr(sys.stdin)
    Returns:
        int: The return code of the command.
    """
    original_sigint = signal.getsignal(signal.SIGINT)

    ctrl_c_pressed = False
    process = None

    def sigint_handler(signum, frame):
        nonlocal ctrl_c_pressed

@@ -131,106 +121,19 @@ def _run_with_pty_unix(command):
        # Set up the signal handler
        signal.signal(signal.SIGINT, sigint_handler)

        new_settings = termios.tcgetattr(sys.stdin)
        new_settings[3] = new_settings[3] & ~termios.ECHO  # Disable echo
        new_settings[3] = new_settings[3] & ~termios.ICANON  # Disable canonical mode
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, new_settings)

        process = subprocess.Popen(
        # Run the command with stdout/stderr piped directly to system streams
        result = subprocess.run(
            command,
            stdin=slave,
            stdout=slave,
            stderr=slave,
            universal_newlines=True,
            preexec_fn=os.setsid,
            text=True,
            check=False,
        )

        # Close the slave file descriptor as it's now owned by the subprocess
        os.close(slave)

        def handle_io():
            while not ctrl_c_pressed:
                try:
                    rlist, _, _ = select.select([sys.stdin, master], [], [], 0.1)

                    if sys.stdin in rlist:
                        data = os.read(sys.stdin.fileno(), 1024)
                        if not data:
                            break
                        os.write(master, data)

                    if master in rlist:
                        data = os.read(master, 1024)
                        if not data:
                            break
                        sys.stdout.buffer.write(data)
                        sys.stdout.flush()

                except KeyboardInterrupt:
                    # This will be raised when Ctrl+C is pressed
                    break

                if process.poll() is not None:
                    break

        handle_io()
    except (EOFError, KeyboardInterrupt):
        pass
    except OSError as e:
        if e.errno != errno.EIO:
            raise
    finally:
        # Clean up
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        signal.signal(signal.SIGINT, original_sigint)

        os.close(master)
        if process and process.poll() is None:
            process.terminate()
            process.wait()

        return process.returncode


# run a command in a pseudo-terminal in windows, with interrupt handling,
def _run_with_pty_win(command):
    """
    Runs a command with interactive support using subprocess directly.
    """
    try:
        # For shell scripts on Windows, use appropriate shell
        if isinstance(command, (list, tuple)):
            if command[0].endswith(".sh"):
                if os.path.exists("/usr/bin/bash"):  # WSL
                    command = ["bash"] + command
                else:
                    # Use cmd.exe with bash while preserving all arguments
                    command = ["cmd.exe", "/c", "bash"] + command

        process = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=True,
        )

        process.wait()

        return result.returncode
    except subprocess.SubprocessError as e:
        log.error(f"Subprocess error: {e}")
        return 1
    except Exception as e:
        print(f"Error: {str(e)}")
        log.exception(f"Unexpected error: {e}")
        return 1
    finally:
        if process and process.poll() is None:
            process.terminate()
            process.wait()
            return process.returncode


def run_command(command):
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        print("Script Output\n", result.stdout)
        return result.returncode
    except subprocess.CalledProcessError as e:
        print("Error running script:", e)
        print("Error output:", e.stderr)
        return e.returncode
        # Restore the original signal handler
        signal.signal(signal.SIGINT, original_sigint)

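Taken together, this change collapses the PTY machinery into a single subprocess-based helper. A minimal sketch of calling the new helper, with an illustrative command list:

# Illustrative only: invoke the simplified helper with an argv-style list.
# A non-zero return code is reported to the caller rather than raised.
from llama_stack.distribution.utils.exec import run_command

return_code = run_command(["echo", "hello from llama stack"])
if return_code != 0:
    print(f"command failed with exit code {return_code}")
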
@@ -44,7 +44,9 @@ class PandasDataframeDataset:
        elif self.dataset_def.source.type == "rows":
            self.df = pandas.DataFrame(self.dataset_def.source.rows)
        else:
            raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
            raise ValueError(
                f"Unsupported dataset source type: {self.dataset_def.source.type}"
            )

        if self.df is None:
            raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")

@@ -108,7 +110,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):

        return IterrowsResponse(
            data=rows,
            next_index=end if end < len(dataset_impl) else None,
            next_start_index=end if end < len(dataset_impl) else None,
        )

    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:

@@ -117,4 +119,6 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        dataset_impl.load()

        new_rows_df = pandas.DataFrame(rows)
        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
        dataset_impl.df = pandas.concat(
            [dataset_impl.df, new_rows_df], ignore_index=True
        )

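The `next_index` to `next_start_index` rename above implies a cursor-style pagination loop. A hypothetical client-side sketch, where `client` and its `datasetio` method names are assumptions for illustration:

# Hypothetical sketch of cursor-style pagination over the renamed field.
async def read_all_rows(client, dataset_id: str) -> list:
    rows, start_index = [], 0
    while start_index is not None:
        page = await client.datasetio.iterrows(
            dataset_id=dataset_id, start_index=start_index, limit=100
        )
        rows.extend(page.data)
        start_index = page.next_start_index  # None once the dataset is exhausted
    return rows
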
@@ -55,4 +55,13 @@ def available_providers() -> List[ProviderSpec]:
            config_class="llama_stack.providers.remote.safety.bedrock.BedrockSafetyConfig",
        ),
    ),
    remote_provider_spec(
        api=Api.safety,
        adapter=AdapterSpec(
            adapter_type="nvidia",
            pip_packages=["requests"],
            module="llama_stack.providers.remote.safety.nvidia",
            config_class="llama_stack.providers.remote.safety.nvidia.NVIDIASafetyConfig",
        ),
    ),
]

@@ -86,7 +86,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):

        return IterrowsResponse(
            data=rows,
            next_index=end if end < len(loaded_dataset) else None,
            next_start_index=end if end < len(loaded_dataset) else None,
        )

    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:

@@ -98,9 +98,13 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        new_dataset = hf_datasets.Dataset.from_list(rows)

        # Concatenate the new rows with existing dataset
        updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
        updated_dataset = hf_datasets.concatenate_datasets(
            [loaded_dataset, new_dataset]
        )

        if dataset_def.metadata.get("path", None):
            updated_dataset.push_to_hub(dataset_def.metadata["path"])
        else:
            raise NotImplementedError("Uploading to URL-based datasets is not supported yet")
            raise NotImplementedError(
                "Uploading to URL-based datasets is not supported yet"
            )

@@ -12,6 +12,7 @@ from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import (
    ChatCompletionResponse,
    ChatCompletionResponseStreamChunk,
    CompletionMessage,
    EmbeddingsResponse,
    EmbeddingTaskType,
    Inference,

@@ -160,12 +161,14 @@ class PassthroughInferenceAdapter(Inference):
        client = self._get_client()
        response = await client.inference.chat_completion(**json_params)

        response = response.to_dict()

        # temporary hack to remove the metrics from the response
        response["metrics"] = []

        return convert_to_pydantic(ChatCompletionResponse, response)
        return ChatCompletionResponse(
            completion_message=CompletionMessage(
                content=response.completion_message.content.text,
                stop_reason=response.completion_message.stop_reason,
                tool_calls=response.completion_message.tool_calls,
            ),
            logprobs=response.logprobs,
        )

    async def _stream_chat_completion(self, json_params: Dict[str, Any]) -> AsyncGenerator:
        client = self._get_client()

18
llama_stack/providers/remote/safety/nvidia/__init__.py
Normal file

@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from typing import Any

from .config import NVIDIASafetyConfig


async def get_adapter_impl(config: NVIDIASafetyConfig, _deps) -> Any:
    from .nvidia import NVIDIASafetyAdapter

    impl = NVIDIASafetyAdapter(config)
    await impl.initialize()
    return impl

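For orientation, this entry point is what the stack resolver calls when wiring the provider. Invoked directly it would look roughly like the sketch below; the empty `_deps` argument is an assumption for illustration:

# Sketch: instantiate the adapter via the module entry point.
import asyncio

from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig, get_adapter_impl

adapter = asyncio.run(get_adapter_impl(NVIDIASafetyConfig(), {}))
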
37
llama_stack/providers/remote/safety/nvidia/config.py
Normal file

@@ -0,0 +1,37 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type


@json_schema_type
class NVIDIASafetyConfig(BaseModel):
    """
    Configuration for the NVIDIA Guardrail microservice endpoint.

    Attributes:
        guardrails_service_url (str): A base url for accessing the NVIDIA guardrail endpoint, e.g. http://0.0.0.0:7331
        config_id (str): The ID of the guardrails configuration to use from the configuration store
            (https://developer.nvidia.com/docs/nemo-microservices/guardrails/source/guides/configuration-store-guide.html)
    """

    guardrails_service_url: str = Field(
        default_factory=lambda: os.getenv("GUARDRAILS_SERVICE_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the guardrails service",
    )
    config_id: Optional[str] = Field(default="self-check", description="Config ID to use from the config store")

    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
            "guardrails_service_url": "${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}",
            "config_id": "self-check",
        }

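A quick sketch of constructing this config directly, e.g. in a test; the values shown mirror the defaults defined above:

# Sketch: build the config explicitly instead of via environment variables.
from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig

config = NVIDIASafetyConfig(
    guardrails_service_url="http://localhost:7331",
    config_id="self-check",
)
print(config.model_dump())  # pydantic model, as used elsewhere in this diff
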
154
llama_stack/providers/remote/safety/nvidia/nvidia.py
Normal file

@@ -0,0 +1,154 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import logging
from typing import Any, List, Optional

import requests

from llama_stack.apis.inference import Message
from llama_stack.apis.safety import RunShieldResponse, Safety, SafetyViolation, ViolationLevel
from llama_stack.apis.shields import Shield
from llama_stack.distribution.library_client import convert_pydantic_to_json_value
from llama_stack.providers.datatypes import ShieldsProtocolPrivate

from .config import NVIDIASafetyConfig

logger = logging.getLogger(__name__)


class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
    def __init__(self, config: NVIDIASafetyConfig) -> None:
        """
        Initialize the NVIDIASafetyAdapter with a given safety configuration.

        Args:
            config (NVIDIASafetyConfig): The configuration containing the guardrails service URL and config ID.
        """
        print(f"Initializing NVIDIASafetyAdapter({config.guardrails_service_url})...")
        self.config = config

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def register_shield(self, shield: Shield) -> None:
        if not shield.provider_resource_id:
            raise ValueError("Shield model not provided.")

    async def run_shield(
        self, shield_id: str, messages: List[Message], params: Optional[dict[str, Any]] = None
    ) -> RunShieldResponse:
        """
        Run a safety shield check against the provided messages.

        Args:
            shield_id (str): The unique identifier for the shield to be used.
            messages (List[Message]): A list of Message objects representing the conversation history.
            params (Optional[dict[str, Any]]): Additional parameters for the shield check.

        Returns:
            RunShieldResponse: The response containing safety violation details if any.

        Raises:
            ValueError: If the shield with the provided shield_id is not found.
        """
        shield = await self.shield_store.get_shield(shield_id)
        if not shield:
            raise ValueError(f"Shield {shield_id} not found")

        self.shield = NeMoGuardrails(self.config, shield.shield_id)
        return await self.shield.run(messages)


class NeMoGuardrails:
    """
    A class that encapsulates NVIDIA's guardrails safety logic.

    Sends messages to the guardrails service and interprets the response to determine
    if a safety violation has occurred.
    """

    def __init__(
        self,
        config: NVIDIASafetyConfig,
        model: str,
        threshold: float = 0.9,
        temperature: float = 1.0,
    ):
        """
        Initialize a NeMoGuardrails instance with the provided parameters.

        Args:
            config (NVIDIASafetyConfig): The safety configuration containing the config ID and guardrails URL.
            model (str): The identifier or name of the model to be used for safety checks.
            threshold (float, optional): The threshold for flagging violations. Defaults to 0.9.
            temperature (float, optional): The temperature setting for the underlying model. Must be greater than 0. Defaults to 1.0.

        Raises:
            ValueError: If temperature is less than or equal to 0.
            AssertionError: If config_id is not provided in the configuration.
        """
        self.config_id = config.config_id
        self.model = model
        assert self.config_id is not None, "Must provide config id"
        if temperature <= 0:
            raise ValueError("Temperature must be greater than 0")

        self.temperature = temperature
        self.threshold = threshold
        self.guardrails_service_url = config.guardrails_service_url

    async def run(self, messages: List[Message]) -> RunShieldResponse:
        """
        Queries the /v1/guardrail/checks endpoint of the NeMo guardrails deployed API.

        Args:
            messages (List[Message]): A list of Message objects to be checked for safety violations.

        Returns:
            RunShieldResponse: If the response indicates a violation ("blocked" status), returns a
            RunShieldResponse with a SafetyViolation; otherwise, returns a RunShieldResponse with violation set to None.

        Raises:
            requests.HTTPError: If the POST request fails.
        """
        headers = {
            "Accept": "application/json",
        }
        request_data = {
            "model": self.model,
            "messages": convert_pydantic_to_json_value(messages),
            "temperature": self.temperature,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "max_tokens": 160,
            "stream": False,
            "guardrails": {
                "config_id": self.config_id,
            },
        }
        response = requests.post(
            url=f"{self.guardrails_service_url}/v1/guardrail/checks", headers=headers, json=request_data
        )
        response.raise_for_status()
        if "Content-Type" in response.headers and response.headers["Content-Type"].startswith("application/json"):
            response_json = response.json()
            if response_json["status"] == "blocked":
                user_message = "Sorry I cannot do this."
                metadata = response_json["rails_status"]

                return RunShieldResponse(
                    violation=SafetyViolation(
                        user_message=user_message,
                        violation_level=ViolationLevel.ERROR,
                        metadata=metadata,
                    )
                )
        return RunShieldResponse(violation=None)

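A hedged sketch of exercising the new adapter end to end; the message construction follows the `run_shield` signature above, while the surrounding setup (an initialized adapter whose shield store already contains a shield registered as "self-check") is assumed:

# Sketch, assuming `adapter` is an initialized NVIDIASafetyAdapter.
from llama_stack.apis.inference import UserMessage

async def check(adapter) -> None:
    response = await adapter.run_shield(
        shield_id="self-check",
        messages=[UserMessage(content="How do I hotwire a car?")],
    )
    if response.violation is not None:
        print(response.violation.user_message)  # e.g. "Sorry I cannot do this."
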
@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

@@ -1,15 +0,0 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

THIS_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"

set -euo pipefail
set -x

stack_dir=$(dirname $(dirname $THIS_DIR))
PYTHONPATH=$stack_dir pytest -p no:warnings --asyncio-mode auto --tb=short

@@ -1,13 +1,13 @@
version: '2'
distribution_spec:
  description: Use NVIDIA NIM for running LLM inference
  description: Use NVIDIA NIM for running LLM inference and safety
  providers:
    inference:
    - remote::nvidia
    vector_io:
    - inline::faiss
    safety:
    - inline::llama-guard
    - remote::nvidia
    agents:
    - inline::meta-reference
    telemetry:

@@ -15,16 +15,9 @@ distribution_spec:
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
    - inline::code-interpreter
    - inline::rag-runtime
    - remote::model-context-protocol
image_type: conda

@@ -6,9 +6,10 @@

from pathlib import Path

from llama_stack.distribution.datatypes import Provider, ToolGroupInput
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry

@@ -16,19 +17,13 @@ def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::nvidia"],
        "vector_io": ["inline::faiss"],
        "safety": ["inline::llama-guard"],
        "safety": ["remote::nvidia"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "inline::code-interpreter",
            "inline::rag-runtime",
            "remote::model-context-protocol",
        ],
        "datasetio": ["inline::localfs"],
        "scoring": ["inline::basic"],
        "tool_runtime": ["inline::rag-runtime"],
    }

    inference_provider = Provider(

@@ -36,30 +31,35 @@ def get_distribution_template() -> DistributionTemplate:
        provider_type="remote::nvidia",
        config=NVIDIAConfig.sample_run_config(),
    )
    safety_provider = Provider(
        provider_id="nvidia",
        provider_type="remote::nvidia",
        config=NVIDIASafetyConfig.sample_run_config(),
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="nvidia",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="nvidia",
    )

    available_models = {
        "nvidia": MODEL_ENTRIES,
    }
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="tavily-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::code_interpreter",
            provider_id="code-interpreter",
        ),
    ]

    default_models = get_model_registry(available_models)
    return DistributionTemplate(
        name="nvidia",
        distro_type="remote_hosted",
        description="Use NVIDIA NIM for running LLM inference",
        description="Use NVIDIA NIM for running LLM inference and safety",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,

@@ -72,15 +72,34 @@ def get_distribution_template() -> DistributionTemplate:
                default_models=default_models,
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        safety_provider,
                    ]
                },
                default_models=[inference_model, safety_model],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
                default_tool_groups=default_tool_groups,
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "NVIDIA_API_KEY": (
                "",
                "NVIDIA API Key",
            ),
            "GUARDRAILS_SERVICE_URL": (
                "http://0.0.0.0:7331",
                "URL for the NeMo Guardrails Service",
            ),
            "INFERENCE_MODEL": (
                "Llama3.1-8B-Instruct",
                "Inference model",
            ),
            "SAFETY_MODEL": (
                "meta/llama-3.1-8b-instruct",
                "Name of the model to use for safety",
            ),
        },
    )

101
llama_stack/templates/nvidia/run-with-safety.yaml
Normal file

@@ -0,0 +1,101 @@
version: '2'
image_name: nvidia
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:}
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}
      config_id: self-check
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
  safety:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}
      config_id: self-check
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
  datasetio:
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: nvidia
  model_type: llm
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: nvidia
  model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL}
  provider_id: nvidia
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8321

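Once a server is running from this file (e.g. via `llama stack run llama_stack/templates/nvidia/run-with-safety.yaml`), the shield can be exercised from the client. The snippet below is a sketch and assumes the `llama-stack-client` Python package; the shield id shown is the `${env.SAFETY_MODEL}` default from the template:

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # port from the yaml above
response = client.safety.run_shield(
    shield_id="meta/llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "hello"}],
    params={},
)
print(response.violation)
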
@@ -26,10 +26,11 @@ providers:
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      excluded_categories: []
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}
      config_id: self-check
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -54,13 +55,6 @@ providers:
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:

@@ -72,33 +66,10 @@ providers:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: code-interpreter
    provider_type: inline::code-interpreter
    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db

@@ -227,11 +198,7 @@ datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
  provider_id: code-interpreter
server:
  port: 8321

@@ -269,6 +269,7 @@ exclude = [
    "^llama_stack/providers/remote/inference/together/",
    "^llama_stack/providers/remote/inference/vllm/",
    "^llama_stack/providers/remote/safety/bedrock/",
    "^llama_stack/providers/remote/safety/nvidia/",
    "^llama_stack/providers/remote/safety/sample/",
    "^llama_stack/providers/remote/tool_runtime/bing_search/",
    "^llama_stack/providers/remote/tool_runtime/brave_search/",

3
llama_stack/scripts/distro_codegen.py → scripts/distro_codegen.py
Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#

@@ -20,7 +21,7 @@ from llama_stack.distribution.build import (
    get_provider_dependencies,
)

REPO_ROOT = Path(__file__).parent.parent.parent
REPO_ROOT = Path(__file__).parent.parent


class ChangedPathTracker:

1
scripts/gen-changelog.py
Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#

5
llama_stack/scripts/generate_prompt_format.py → scripts/generate_prompt_format.py
Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#

@@ -18,7 +19,7 @@ import fire

from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.inline.inference.meta_reference.config import MetaReferenceInferenceConfig
from llama_stack.providers.inline.inference.meta_reference.generation import Llama
from llama_stack.providers.inline.inference.meta_reference.llama3.generation import Llama3

THIS_DIR = Path(__file__).parent.resolve()

@@ -41,7 +42,7 @@ def run_main(
    llama_model = resolve_model(model_id)
    if not llama_model:
        raise ValueError(f"Model {model_id} not found")
    generator = Llama.build(
    generator = Llama3.build(
        config=config,
        model_id=model_id,
        llama_model=llama_model,

8
llama_stack/scripts/run_client_sdk_tests.py → scripts/run_client_sdk_tests.py
Normal file → Executable file

@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#

@@ -15,8 +16,7 @@ Script for running api on AsyncLlamaStackAsLibraryClient with templates

Assuming directory structure:
- llama-stack
    - llama_stack
        - scripts
    - scripts
    - tests
        - api

@@ -25,10 +25,10 @@ Example command:
cd llama-stack
EXPORT TOGETHER_API_KEY=<..>
EXPORT FIREWORKS_API_KEY=<..>
python llama_stack/scripts/run_client_sdk_tests.py --templates together fireworks --report
./scripts/run_client_sdk_tests.py --templates together fireworks --report
"""

REPO_ROOT = Path(__file__).parent.parent.parent
REPO_ROOT = Path(__file__).parent.parent
CLIENT_SDK_TESTS_RELATIVE_PATH = "tests/api/"