diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 567110829..6b5793119 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6347,7 +6347,36 @@
"default": "model"
},
"model": {
- "type": "string",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ ],
"description": "The model ID to evaluate."
},
"sampling_params": {
@@ -6362,8 +6391,7 @@
"additionalProperties": false,
"required": [
"type",
- "model",
- "sampling_params"
+ "model"
],
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 1dfd17f55..8c5746900 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4468,7 +4468,17 @@ components:
const: model
default: model
model:
- type: string
+ oneOf:
+ - type: string
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
description: The model ID to evaluate.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
@@ -4482,7 +4492,6 @@ components:
required:
- type
- model
- - sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
RegexParserScoringFnParams:
diff --git a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
index e9f2d9a9e..0be64073c 100644
--- a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This notebook contains Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA."
+    "This notebook contains a Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using the NVIDIA provider."
]
},
{
@@ -12,7 +12,49 @@
"metadata": {},
"source": [
"## Prerequisites\n",
- "- Please reference to setup the NVIDIA platform. "
+ "First, ensure the NeMo Microservices platform is up and running, including the model downloading step for `meta/llama-3.2-1b-instruct`. See installation instructions: https://aire.gitlab-master-pages.nvidia.com/microservices/documentation/latest/nemo-microservices/latest-internal/set-up/deploy-as-platform/index.html (TODO: Update to public docs)"
+ ]
+ },
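+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, confirm that the platform is reachable by listing the models NIM is currently serving. The cell below is a minimal sketch that assumes the example `NIM_URL` used in the Setup section; adjust it to your deployment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal reachability check (assumes the example NIM URL below; adjust as needed).\n",
+    "import requests\n",
+    "\n",
+    "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
+    "response = requests.get(f\"{NIM_URL}/v1/models\")\n",
+    "response.raise_for_status()\n",
+    "print([model[\"id\"] for model in response.json()[\"data\"]])"
+   ]
+  },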
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Next, set up your development environment. From the root of the project, create and activate a virtual environment:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "uv sync --extra dev\n",
+ "uv pip install -e .\n",
+ "source .venv/bin/activate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Build the Llama Stack image using the virtual environment. For local development, set `LLAMA_STACK_DIR` to ensure your local code is used in the image. To use the production version of `llama-stack`, omit `LLAMA_STACK_DIR`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv"
]
},
{
@@ -22,9 +64,23 @@
"## Setup\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Configure the environment variables for each service.\n",
+ "\n",
+ "If needed, update the URLs for each service to point to your deployment.\n",
+ "- NDS_URL: NeMo Data Store URL\n",
+ "- NEMO_URL: NeMo Microservices Platform URL\n",
+ "- NIM_URL: NIM URL\n",
+ "\n",
+    "For more information about these variables, see the [NVIDIA Distro documentation](docs/source/distributions/remote_hosted_distro/nvidia.md)."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -35,13 +91,13 @@
"NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
"NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
"\n",
- "# Inference env vars\n",
- "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
- "\n",
"USER_ID = \"llama-stack-user\"\n",
"NAMESPACE = \"default\"\n",
- "PROJECT_ID = \"test-project\"\n",
- "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n",
+ "PROJECT_ID = \"\"\n",
+ "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v2\"\n",
+ "\n",
+ "# Inference env vars\n",
+ "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
"\n",
"# Customizer env vars\n",
"os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n",
@@ -50,16 +106,16 @@
"os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n",
- "# Guardrails env vars\n",
- "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
- "\n",
"# Evaluator env vars\n",
- "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
+ "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n",
+ "\n",
+ "# Guardrails env vars\n",
+ "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -70,14 +126,14 @@
"from time import sleep, time\n",
"from typing import Dict\n",
"\n",
- "# import aiohttp\n",
- "# import requests\n",
- "# from huggingface_hub import HfApi\n",
+ "import aiohttp\n",
+ "import requests\n",
+ "from huggingface_hub import HfApi\n",
"\n",
- "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
- "# os.environ[\"HF_TOKEN\"] = \"token\"\n",
+ "os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
+ "os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n",
- "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
+ "hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
]
},
{
@@ -115,9 +171,10 @@
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
+ " print(f\"Waiting for Customization job {job_id} to finish.\")\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
- " while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+ " while job_status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n",
" sleep(polling_interval)\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
@@ -133,9 +190,10 @@
" start_time = time()\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
+ " print(f\"Waiting for Evaluation job {job_id} to finish.\")\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
- " while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+ " while job_status.status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n",
" sleep(polling_interval)\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
@@ -144,14 +202,49 @@
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
- " return job_status\n"
+ " return job_status\n",
+ "\n",
+ "def wait_nim_loads_customized_model(model_id: str, namespace: str, polling_interval: int = 10, timeout: int = 300):\n",
+ " found = False\n",
+ " start_time = time()\n",
+ "\n",
+ " model_path = f\"{namespace}/{model_id}\"\n",
+ " print(f\"Checking if NIM has loaded customized model {model_path}.\")\n",
+ "\n",
+    "    while not found and (time() - start_time) < timeout:\n",
+ " sleep(polling_interval)\n",
+ "\n",
+ " response = requests.get(f\"{NIM_URL}/v1/models\")\n",
+ " if model_path in [model[\"id\"] for model in response.json()[\"data\"]]:\n",
+ " found = True\n",
+ " print(f\"Model {model_path} available after {time() - start_time} seconds.\")\n",
+ " break\n",
+ " else:\n",
+ " print(f\"Model {model_path} not available after {time() - start_time} seconds.\")\n",
+ "\n",
+    "    if not found:\n",
+    "        raise RuntimeError(f\"Model {model_path} not available after {timeout} seconds.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Upload Dataset Using the HuggingFace Client"
+ "## Upload Dataset Using the HuggingFace Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_squad_test_dataset_name = \"squad-test-dataset\"\n",
+ "namespace = \"default\"\n",
+ "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
]
},
{
@@ -159,20 +252,9 @@
"execution_count": 6,
"metadata": {},
"outputs": [],
- "source": [
- "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
- "namespace = \"default\"\n",
- "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
"source": [
"# Create the repo\n",
- "# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
+ "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
@@ -182,24 +264,24 @@
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/training\",\n",
- "# path_in_repo=\"training\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/validation\",\n",
- "# path_in_repo=\"validation\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/testing\",\n",
- "# path_in_repo=\"testing\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )"
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/training\",\n",
+ " path_in_repo=\"training\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/validation\",\n",
+ " path_in_repo=\"validation\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/testing\",\n",
+ " path_in_repo=\"testing\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")"
]
},
{
@@ -209,7 +291,18 @@
"outputs": [],
"source": [
"# Create the dataset\n",
- "# response = client.datasets.register(...)"
+ "# response = client.datasets.register(...)\n",
+ "response = requests.post(\n",
+ " url=f\"{NEMO_URL}/v1/datasets\",\n",
+ " json={\n",
+ " \"name\": sample_squad_test_dataset_name,\n",
+ " \"namespace\": namespace,\n",
+ " \"description\": \"Dataset created from llama-stack e2e notebook\",\n",
+ " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_test_dataset_name}\",\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n",
+ "json.dumps(response.json(), indent=2)"
]
},
{
@@ -221,7 +314,14 @@
"# Check the files URL\n",
"# response = client.datasets.retrieve(repo_id)\n",
"# dataset = response.model_dump()\n",
- "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\""
+ "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n",
+ "response = requests.get(\n",
+ " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_test_dataset_name}\",\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n",
+ "dataset_obj = response.json()\n",
+ "print(\"Files URL:\", dataset_obj[\"files_url\"])\n",
+ "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\""
]
},
{
@@ -276,22 +376,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Evaluation\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Evaluation\n"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "benchmark_id = \"jg-llama-stack-3\""
+ "benchmark_id = \"test-eval-config-1\""
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -302,36 +401,41 @@
" \"scoring_functions\": [],\n",
" \"metadata\": {\n",
" \"type\": \"custom\",\n",
- " \"params\": {\n",
- " \"parallelism\": 8\n",
- " },\n",
+ " \"params\": {\"parallelism\": 8},\n",
" \"tasks\": {\n",
" \"qa\": {\n",
" \"type\": \"completion\",\n",
" \"params\": {\n",
" \"template\": {\n",
" \"prompt\": \"{{prompt}}\",\n",
- " \"max_tokens\": 200\n",
- " }\n",
- " },\n",
- " \"dataset\": {\n",
- " \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
" },\n",
+ " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n",
" \"metrics\": {\n",
" \"bleu\": {\n",
" \"type\": \"bleu\",\n",
- " \"params\": {\n",
- " \"references\": [\n",
- " \"{{ideal_response}}\"\n",
- " ]\n",
- " }\n",
- " }\n",
- " }\n",
+ " \"params\": {\"references\": [\"{{ideal_response}}\"]},\n",
+ " },\n",
+ " \"string-check\": {\n",
+ " \"type\": \"string-check\",\n",
+ " \"params\": {\"check\": [\"{{ideal_response | trim}}\", \"equals\", \"{{output_text | trim}}\"]},\n",
+ " },\n",
+ " },\n",
" }\n",
" }\n",
" }\n",
- "}\n",
- "\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"response = client.benchmarks.register(\n",
" benchmark_id=benchmark_id,\n",
" dataset_id=repo_id,\n",
@@ -347,32 +451,13 @@
"metadata": {},
"outputs": [],
"source": [
- "for benchmark in client.benchmarks.list():\n",
- " print(benchmark)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Launch a simple evaluation with the benchmark\n",
+ "# Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
" benchmark_id=benchmark_id,\n",
" benchmark_config={\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
- " \"model\": \"meta/llama-3.1-8b-instruct\",\n",
- " \"sampling_params\": {\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 1.0,\n",
- " \"top_p\": 0.95,\n",
- " },\n",
- " \"max_tokens\": 4096,\n",
- " \"repeat_penalty\": 1.0,\n",
- " },\n",
+ " \"model\": \"meta/llama-3.1-8b-instruct\"\n",
" }\n",
" }\n",
")\n",
@@ -406,7 +491,7 @@
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
- "print(f\"Job results: {job_results.model_dump()}\")"
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
]
},
{
@@ -416,7 +501,7 @@
"outputs": [],
"source": [
"# Extract bleu score and assert it's within range\n",
- "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
+ "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
"print(f\"Initial bleu score: {initial_bleu_score}\")\n",
"\n",
"assert initial_bleu_score >= 2"
@@ -429,10 +514,10 @@
"outputs": [],
"source": [
"# Extract accuracy and assert it's within range\n",
- "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
"print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
"\n",
- "assert initial_accuracy_score >= 0.5"
+ "assert initial_accuracy_score >= 0"
]
},
{
@@ -507,7 +592,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Verify that inference with the new model works\n",
+ "# Check that inference with the new model works\n",
"from llama_stack.apis.models.models import ModelType\n",
"\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
@@ -525,7 +610,20 @@
"# sampling_params={\n",
"# \"max_tokens\": 50,\n",
"# },\n",
- "# )"
+ "# )\n",
+ "\n",
+ "res = requests.post(\n",
+ " url=f\"{NIM_URL}/v1/completions\",\n",
+ " json={\n",
+ " \"model\": f\"{namespace}/{CUSTOMIZED_MODEL_DIR}\",\n",
+ " \"prompt\": sample_prompt,\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ ")\n",
+ "assert res.status_code in (200, 201), f\"Status Code {res.status_code} Failed to get adapted model completion {res.text}\"\n",
+ "json.dumps(res.json(), indent=2)"
]
},
{
@@ -533,37 +631,37 @@
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model\n",
- "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
+    "Implement this section once the customized model can be registered in the Model Registry."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Upload Chat Dataset\n",
- "Implement this section after Data Store integration is done.\n",
+ "## Upload Chat Dataset\n",
"Repeat fine-tuning and evaluation with a chat style dataset, which has a list of `messages` instead of a `prompt` and `completion`."
]
},
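+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the sketch below shows the expected shape of one chat-style record (hypothetical contents): a `messages` list whose final assistant turn serves as the reference answer. Each line of the JSONL files under `./tmp/sample_squad_messages/` is assumed to follow this structure."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example of a single chat-style record (illustration only, not read from disk).\n",
+    "import json\n",
+    "\n",
+    "example_record = {\n",
+    "    \"messages\": [\n",
+    "        {\"role\": \"system\", \"content\": \"You answer questions concisely.\"},\n",
+    "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+    "        {\"role\": \"assistant\", \"content\": \"Paris\"},\n",
+    "    ]\n",
+    "}\n",
+    "print(json.dumps(example_record, indent=2))"
+   ]
+  },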
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
- "sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n",
+ "sample_squad_messages_dataset_name = \"test-squad-messages-dataset\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\""
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
- "# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
+    "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
@@ -573,24 +671,24 @@
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/training\",\n",
- "# path_in_repo=\"training\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/validation\",\n",
- "# path_in_repo=\"validation\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/testing\",\n",
- "# path_in_repo=\"testing\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )"
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/training\",\n",
+ " path_in_repo=\"training\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/validation\",\n",
+ " path_in_repo=\"validation\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/testing\",\n",
+ " path_in_repo=\"testing\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")"
]
},
{
@@ -600,7 +698,38 @@
"outputs": [],
"source": [
"# Create the dataset\n",
- "# response = client.datasets.register(...)"
+ "# response = client.datasets.register(...)\n",
+ "response = requests.post(\n",
+ " url=f\"{NEMO_URL}/v1/datasets\",\n",
+ " json={\n",
+ " \"name\": sample_squad_messages_dataset_name,\n",
+ " \"namespace\": namespace,\n",
+ " \"description\": \"Dataset created from llama-stack e2e notebook\",\n",
+ " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n",
+ " \"project\": \"default/project-7tLfD8Lt59wFbarFceF3xN\",\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n",
+ "json.dumps(response.json(), indent=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check the files URL\n",
+ "# response = client.datasets.retrieve(repo_id)\n",
+ "# dataset = response.model_dump()\n",
+ "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n",
+ "response = requests.get(\n",
+ " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n",
+ "dataset_obj = response.json()\n",
+ "print(\"Files URL:\", dataset_obj[\"files_url\"])\n",
+ "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\""
]
},
{
@@ -651,8 +780,151 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Evaluate with chat dataset\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Evaluate with chat dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "benchmark_id = \"test-eval-config-chat-1\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Register a benchmark, which creates an Eval Config\n",
+ "simple_eval_config = {\n",
+ " \"benchmark_id\": benchmark_id,\n",
+ " \"dataset_id\": \"\",\n",
+ " \"scoring_functions\": [],\n",
+ " \"metadata\": {\n",
+ " \"type\": \"custom\",\n",
+ " \"params\": {\"parallelism\": 8},\n",
+ " \"tasks\": {\n",
+ " \"qa\": {\n",
+ " \"type\": \"completion\",\n",
+ " \"params\": {\n",
+ " \"template\": {\n",
+ " \"messages\": [\n",
+ " {\"role\": \"{{item.messages[0].role}}\", \"content\": \"{{item.messages[0].content}}\"},\n",
+ " {\"role\": \"{{item.messages[1].role}}\", \"content\": \"{{item.messages[1].content}}\"},\n",
+ " ],\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ " },\n",
+ " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n",
+ " \"metrics\": {\n",
+ " \"bleu\": {\n",
+ " \"type\": \"bleu\",\n",
+ " \"params\": {\"references\": [\"{{item.messages[2].content | trim}}\"]},\n",
+ " },\n",
+ " \"string-check\": {\n",
+ " \"type\": \"string-check\",\n",
+ " \"params\": {\"check\": [\"{{item.messages[2].content}}\", \"equals\", \"{{output_text | trim}}\"]},\n",
+ " },\n",
+ " },\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.benchmarks.register(\n",
+ " benchmark_id=benchmark_id,\n",
+ " dataset_id=repo_id,\n",
+ " scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
+ " metadata=simple_eval_config[\"metadata\"]\n",
+ ")\n",
+ "print(f\"Created benchmark {benchmark_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Launch a simple evaluation with the benchmark\n",
+ "response = client.eval.run_eval(\n",
+ " benchmark_id=benchmark_id,\n",
+ " benchmark_config={\n",
+ " \"eval_candidate\": {\n",
+ " \"type\": \"model\",\n",
+ " \"model\": \"meta/llama-3.1-8b-instruct\",\n",
+ " }\n",
+ " }\n",
+ ")\n",
+ "job_id = response.model_dump()[\"job_id\"]\n",
+ "print(f\"Created evaluation job {job_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Wait for the job to complete\n",
+ "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Job {job_id} status: {job.status}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract bleu score and assert it's within range\n",
+ "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "print(f\"Initial bleu score: {initial_bleu_score}\")\n",
+ "\n",
+ "assert initial_bleu_score >= 12"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract accuracy and assert it's within range\n",
+ "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
+ "print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
+ "\n",
+ "assert initial_accuracy_score >= 0.2"
]
},
{
@@ -668,13 +940,12 @@
"metadata": {},
"outputs": [],
"source": [
- "customized_model_name = \"messages-example-model\"\n",
- "customized_model_version = \"v2\"\n",
+ "customized_model_name = \"test-messages-model\"\n",
+ "customized_model_version = \"v1\"\n",
"customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n",
"\n",
- "# TODO: We need to re-initialize the client here to pick up the new env vars\n",
- "# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n",
+    "# NOTE: We need to re-initialize the client here so the Post Training API picks up the updated env var\n",
"client.initialize()"
]
},
@@ -717,12 +988,211 @@
"print(f\"Created job with ID: {job_id}\")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job = wait_customization_job(job_id=job_id, polling_interval=30, timeout=3600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# client.models.register(\n",
+ "# model_id=CUSTOMIZED_MODEL_DIR,\n",
+ "# model_type=ModelType.llm,\n",
+ "# provider_id=\"nvidia\",\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check that the customized model has been picked up by NIM;\n",
+ "# We allow up to 5 minutes for the LoRA adapter to be loaded\n",
+ "wait_nim_loads_customized_model(model_id=customized_model_dir, namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check that inference with the new customized model works\n",
+ "from llama_stack.apis.models.models import ModelType\n",
+ "\n",
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# client.models.register(\n",
+ "# model_id=customized_model_dir,\n",
+ "# model_type=ModelType.llm,\n",
+ "# provider_id=\"nvidia\",\n",
+ "# )\n",
+ "\n",
+ "# TODO: This won't work until the code above works - errors with model_id not found.\n",
+ "# response = client.inference.completion(\n",
+ "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+ "# stream=False,\n",
+ "# model_id=f\"default/{customized_model_dir}\",\n",
+ "# sampling_params={\n",
+ "# \"max_tokens\": 50,\n",
+ "# },\n",
+ "# )\n",
+ "\n",
+ "# TODO: Remove this once code above works. Until then, we'll directly call NIM.\n",
+ "response = requests.post(\n",
+ " url=f\"{NIM_URL}/v1/chat/completions\",\n",
+ " json={\n",
+ " \"model\": f\"{namespace}/{customized_model_dir}\",\n",
+ " \"messages\": sample_messages,\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to get adapted model completion {response.text}\"\n",
+ "response.json()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert len(response.json()[\"choices\"][0][\"message\"][\"content\"]) > 1"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Evaluate Customized Model with chat dataset\n",
- "Implement this section after Evalutor integration is done."
+ "## Evaluate Customized Model with chat dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Launch evaluation for customized model\n",
+ "\n",
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# response = client.eval.run_eval(\n",
+ "# benchmark_id=benchmark_id,\n",
+ "# benchmark_config={\n",
+ "# \"eval_candidate\": {\n",
+ "# \"type\": \"model\",\n",
+ "# \"model\": {\n",
+ "# \"api_endpoint\": {\n",
+ "# \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n",
+ "# \"model_id\": f\"{namespace}/{customized_model_dir}\",\n",
+ "# }\n",
+ "# },\n",
+ "# }\n",
+ "# }\n",
+ "# )\n",
+ "# job_id = response.model_dump()[\"job_id\"]\n",
+ "# print(f\"Created evaluation job {job_id}\")\n",
+ "\n",
+ "# TODO: Remove this once code above works. Until then, we'll directly call the Eval API.\n",
+ "response = requests.post(\n",
+ " f\"{NEMO_URL}/v1/evaluation/jobs\",\n",
+ " json={\n",
+ " \"config\": f\"nvidia/{benchmark_id}\",\n",
+ " \"target\": {\n",
+ " \"type\": \"model\",\n",
+ " \"model\": {\n",
+ " \"api_endpoint\": {\n",
+ " \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n",
+ " \"model_id\": f\"{namespace}/{customized_model_dir}\",\n",
+ " }\n",
+ " },\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create new evaluation target {response.text}\"\n",
+ "response.json()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_id = response.json()[\"id\"]\n",
+ "print(f\"Created evaluation job {job_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract bleu score and assert it's within range\n",
+ "customized_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "print(f\"Customized bleu score: {customized_bleu_score}\")\n",
+ "\n",
+ "assert customized_bleu_score >= 40"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract accuracy and assert it's within range\n",
+ "customized_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
+ "print(f\"Customized accuracy: {customized_accuracy_score}\")\n",
+ "\n",
+ "assert customized_accuracy_score >= 0.47"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Ensure the customized model evaluation is better than the original model evaluation\n",
+ "print(f\"customized_bleu_score - initial_bleu_score: {customized_bleu_score - initial_bleu_score}\")\n",
+ "assert (customized_bleu_score - initial_bleu_score) >= 20\n",
+ "\n",
+ "print(f\"customized_accuracy_score - initial_accuracy_score: {customized_accuracy_score - initial_accuracy_score}\")\n",
+ "assert (customized_accuracy_score - initial_accuracy_score) >= 0.2"
]
},
{
@@ -757,8 +1227,7 @@
"outputs": [],
"source": [
"# Check inference with guardrails\n",
- "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
- "message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n",
+    "message = {\"role\": \"user\", \"content\": \"You are stupid.\"}\n",
"response = client.safety.run_shield(\n",
" messages=[message],\n",
" shield_id=shield_id,\n",
@@ -769,16 +1238,14 @@
")\n",
"\n",
"print(f\"Safety response: {response}\")\n",
- "# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n",
- "# assert response.user_message == \"Sorry I cannot do this.\""
+ "assert response.user_message == \"Sorry I cannot do this.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Guardrails Evaluation\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Guardrails Evaluation\n"
]
}
],
@@ -798,7 +1265,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.10"
+ "version": "3.10.2"
}
},
"nbformat": 4,
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 0e5959c37..f9bb9a171 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -27,8 +27,8 @@ class ModelCandidate(BaseModel):
"""
type: Literal["model"] = "model"
- model: str
- sampling_params: SamplingParams
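+    # NOTE: `model` accepts either a registered model ID (str) or a provider-specific
+    # target descriptor (dict), e.g. a NeMo Evaluator `api_endpoint` block.
+    # `sampling_params` is optional and defaults to `SamplingParams()` when omitted.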
+ model: Union[str, Dict[str, Any]]
+ sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
system_message: Optional[SystemMessage] = None
diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py
index 2ef46251e..ca8464f51 100644
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@@ -48,13 +48,13 @@ class NVIDIAEvalImpl(
async def _evaluator_get(self, path):
"""Helper for making GET requests to the evaluator service."""
- response = requests.get(url=f"{self.config.evaluator_service_url}/{path}")
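+        # NOTE: `path` is expected to include a leading slash, e.g. "/v1/evaluation/jobs".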
+ response = requests.get(url=f"{self.config.evaluator_service_url}{path}")
response.raise_for_status()
return response.json()
async def _evaluator_post(self, path, data):
"""Helper for making POST requests to the evaluator service."""
- response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data)
+ response = requests.post(url=f"{self.config.evaluator_service_url}{path}", json=data)
response.raise_for_status()
return response.json()
diff --git a/llama_stack/providers/remote/post_training/nvidia/post_training.py b/llama_stack/providers/remote/post_training/nvidia/post_training.py
index b3653b527..7594d5554 100644
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@@ -408,7 +408,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
if v is not None
}
else:
- raise NotImplementedError(f"JASH was here Unsupported algorithm config: {algorithm_config}")
+ raise NotImplementedError(f"Unsupported algorithm config: {algorithm_config}")
# Create the customization job
response = await self._make_request(