In-progress: e2e notebook with partial Eval integration

Jash Gulabrai 2025-04-08 14:08:01 -04:00
parent 861962fa80
commit c04ab0133d
19 changed files with 832 additions and 624 deletions


@ -31,9 +31,9 @@
"import os\n", "import os\n",
"\n", "\n",
"# NVIDIA URLs\n", "# NVIDIA URLs\n",
"NDS_URL = \"\"\n", "NDS_URL = \"https://datastore.int.aire.nvidia.com\"\n",
"NEMO_URL = \"\"\n", "NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
"NIM_URL = \"\"\n", "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
"\n", "\n",
"# Inference env vars\n", "# Inference env vars\n",
"os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n", "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
@ -51,12 +51,15 @@
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n", "\n",
"# Guardrails env vars\n", "# Guardrails env vars\n",
"os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL" "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
"\n",
"# Evaluator env vars\n",
"os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -67,14 +70,14 @@
"from time import sleep, time\n", "from time import sleep, time\n",
"from typing import Dict\n", "from typing import Dict\n",
"\n", "\n",
"import aiohttp\n", "# import aiohttp\n",
"import requests\n", "# import requests\n",
"from huggingface_hub import HfApi\n", "# from huggingface_hub import HfApi\n",
"\n", "\n",
"os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n", "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
"os.environ[\"HF_TOKEN\"] = \"token\"\n", "# os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n", "\n",
"hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))" "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
] ]
}, },
{ {
@ -87,546 +90,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning: `bwrap` is not available. Code interpreter tool will not work correctly.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing NVIDIASafetyAdapter(https://nmp.int.aire.nvidia.com)...\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Using config <span style=\"color: #000080; text-decoration-color: #000080\">nvidia</span>:\n",
"</pre>\n"
],
"text/plain": [
"Using config \u001b[34mnvidia\u001b[0m:\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">apis:\n",
"- agents\n",
"- datasetio\n",
"- eval\n",
"- inference\n",
"- post_training\n",
"- safety\n",
"- scoring\n",
"- telemetry\n",
"- tool_runtime\n",
"- vector_io\n",
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
"container_image: null\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: nvidia\n",
"logging: null\n",
"metadata_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
"models:\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-FP8\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8192</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2048</span>\n",
" model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: nvidia/nv-embedqa-e5-v5\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-e5-v5\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4096</span>\n",
" model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: snowflake/arctic-embed-l\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: snowflake/arctic-embed-l\n",
"providers:\n",
" agents:\n",
" - config:\n",
" persistence_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">agents_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" datasetio:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">localfs_datasetio.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: localfs\n",
" provider_type: inline::localfs\n",
" eval:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">meta_reference_eval.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" inference:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nim.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" post_training:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" customizer_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" dataset_namespace: default\n",
" project_id: test-project\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" safety:\n",
" - config:\n",
" config_id: self-check\n",
" guardrails_service_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" scoring:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: basic\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::ba</span>sic\n",
" telemetry:\n",
" - config:\n",
" service_name: <span style=\"color: #008000; text-decoration-color: #008000\">\"\\u200B\"</span>\n",
" sinks: sqlite\n",
" sqlite_db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">trace_store.db</span>\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" tool_runtime:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: rag-runtime\n",
" provider_type: inline::rag-runtime\n",
" vector_io:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">faiss_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: faiss\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::fa</span>iss\n",
"scoring_fns: <span style=\"font-weight: bold\">[]</span>\n",
"server:\n",
" auth: null\n",
" port: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8321</span>\n",
" tls_certfile: null\n",
" tls_keyfile: null\n",
"shields: <span style=\"font-weight: bold\">[]</span>\n",
"tool_groups:\n",
"- args: null\n",
" mcp_endpoint: null\n",
" provider_id: rag-runtime\n",
" toolgroup_id: builtin::rag\n",
"vector_dbs: <span style=\"font-weight: bold\">[]</span>\n",
"version: <span style=\"color: #008000; text-decoration-color: #008000\">'2'</span>\n",
"\n",
"</pre>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n", "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"\n", "\n",
@ -634,6 +100,53 @@
"client.initialize()" "client.initialize()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Helper functions for waiting on jobs\n",
"from llama_stack.apis.common.job_types import JobStatus\n",
"\n",
"def wait_customization_job(job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
"\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Customization Job {job_id} took more than {timeout} seconds.\")\n",
" \n",
" return job_status\n",
"\n",
"def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
" return job_status\n"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -643,11 +156,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"sample_squad_test_dataset_name = \"jg-llama-stack\"\n", "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
"namespace = \"default\"\n", "namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\"" "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
] ]
@ -767,12 +280,160 @@
"TODO: Implement this section after Evalutor integration is done." "TODO: Implement this section after Evalutor integration is done."
] ]
}, },
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"benchmark_id = \"jg-llama-stack-3\""
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": [
"# Register a benchmark, which creates an Evaluation Config\n",
"simple_eval_config = {\n",
" \"benchmark_id\": benchmark_id,\n",
" \"dataset_id\": \"\",\n",
" \"scoring_functions\": [],\n",
" \"metadata\": {\n",
" \"type\": \"custom\",\n",
" \"params\": {\n",
" \"parallelism\": 8\n",
" },\n",
" \"tasks\": {\n",
" \"qa\": {\n",
" \"type\": \"completion\",\n",
" \"params\": {\n",
" \"template\": {\n",
" \"prompt\": \"{{prompt}}\",\n",
" \"max_tokens\": 200\n",
" }\n",
" },\n",
" \"dataset\": {\n",
" \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
" },\n",
" \"metrics\": {\n",
" \"bleu\": {\n",
" \"type\": \"bleu\",\n",
" \"params\": {\n",
" \"references\": [\n",
" \"{{ideal_response}}\"\n",
" ]\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"response = client.benchmarks.register(\n",
" benchmark_id=benchmark_id,\n",
" dataset_id=repo_id,\n",
" scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
" metadata=simple_eval_config[\"metadata\"]\n",
")\n",
"print(f\"Created benchmark {benchmark_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for benchmark in client.benchmarks.list():\n",
" print(benchmark)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
" benchmark_id=benchmark_id,\n",
" benchmark_config={\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
" \"model\": \"meta/llama-3.1-8b-instruct\",\n",
" \"sampling_params\": {\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 1.0,\n",
" \"top_p\": 0.95,\n",
" },\n",
" \"max_tokens\": 4096,\n",
" \"repeat_penalty\": 1.0,\n",
" },\n",
" }\n",
" }\n",
")\n",
"job_id = response.model_dump()[\"job_id\"]\n",
"print(f\"Created evaluation job {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
"print(f\"Job results: {job_results.model_dump()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract bleu score and assert it's within range\n",
"initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
"print(f\"Initial bleu score: {initial_bleu_score}\")\n",
"\n",
"assert initial_bleu_score >= 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract accuracy and assert it's within range\n",
"initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
"print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
"\n",
"assert initial_accuracy_score >= 0.5"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -827,11 +488,17 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Customized model isn't available in the list of models, so this check doesn't work.\n", "# Wait for the job to complete\n",
"# customized_model = client.models.retrieve(f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\")\n", "job_status = wait_customization_job(job_id=job_id)"
"# assert customized_model is not None\n", ]
"job_status = client.post_training.job.status(job_uuid=job_id)\n", },
"print(f\"Job status: {job_status.status}\")" {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job_status}\")"
] ]
}, },
{ {
@ -840,10 +507,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# TODO: This doesn't work - errors with model_id not found.\n",
"# Depends on https://github.com/meta-llama/llama-stack/pull/1859\n",
"# Verify that inference with the new model works\n", "# Verify that inference with the new model works\n",
"\n",
"from llama_stack.apis.models.models import ModelType\n", "from llama_stack.apis.models.models import ModelType\n",
"\n", "\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
@ -853,14 +517,15 @@
"# provider_id=\"nvidia\",\n", "# provider_id=\"nvidia\",\n",
"# )\n", "# )\n",
"\n", "\n",
"response = client.inference.completion(\n", "# TODO: This won't work until the code above works - errors with model_id not found.\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n", "# response = client.inference.completion(\n",
" stream=False,\n", "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n", "# stream=False,\n",
" sampling_params={\n", "# model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
" \"max_tokens\": 50,\n", "# sampling_params={\n",
" },\n", "# \"max_tokens\": 50,\n",
")" "# },\n",
"# )"
] ]
}, },
{ {
@ -868,7 +533,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## TODO: Evaluate Customized Model\n", "## TODO: Evaluate Customized Model\n",
"Implement this section after Evalutor integration is done." "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
] ]
}, },
{ {
@ -1078,39 +743,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/plain": [
"Shield(identifier='self-check', provider_id='nvidia', provider_resource_id='self-check', type='shield', params={}, access_attributes=None)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")" "client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'model': 'self-check', 'messages': [{'role': 'system', 'content': 'You are stupid.'}], 'temperature': 1.0, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'max_tokens': 160, 'stream': False, 'guardrails': {'config_id': 'self-check'}}\n",
"{'status': 'success', 'rails_status': {'self check input': {'status': 'success'}}, 'guardrails_data': {'log': {'activated_rails': [], 'stats': {'total_duration': 0.0007240772247314453, 'llm_calls_duration': 0.0, 'llm_calls_count': 0, 'llm_calls_total_prompt_tokens': 0, 'llm_calls_total_completion_tokens': 0, 'llm_calls_total_tokens': 0}}}}\n",
"Safety response: RunShieldResponse(violation=None)\n"
]
}
],
"source": [ "source": [
"# Check inference with guardrails\n", "# Check inference with guardrails\n",
"# TODO: For some reason, `role: \"user\"` returns a 422 error.\n", "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
@ -1154,7 +798,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.2" "version": "3.11.10"
} }
}, },
"nbformat": 4, "nbformat": 4,


@ -7,7 +7,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|-----|-------------| |-----|-------------|
| agents | `inline::meta-reference` | | agents | `inline::meta-reference` |
| datasetio | `inline::localfs` | | datasetio | `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `remote::nvidia` |
| inference | `remote::nvidia` | | inference | `remote::nvidia` |
| post_training | `remote::nvidia` | | post_training | `remote::nvidia` |
| safety | `remote::nvidia` | | safety | `remote::nvidia` |
@ -29,6 +29,7 @@ The following environment variables can be configured:
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`) - `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`) - `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)


@ -6,7 +6,7 @@
from typing import List from typing import List
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
def available_providers() -> List[ProviderSpec]: def available_providers() -> List[ProviderSpec]:
@ -25,4 +25,22 @@ def available_providers() -> List[ProviderSpec]:
Api.agents, Api.agents,
], ],
), ),
remote_provider_spec(
api=Api.eval,
adapter=AdapterSpec(
adapter_type="nvidia",
pip_packages=[
"requests",
],
module="llama_stack.providers.remote.eval.nvidia",
config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
),
api_dependencies=[
Api.datasetio,
Api.datasets,
Api.scoring,
Api.inference,
Api.agents,
],
),
] ]


@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@ -0,0 +1,126 @@
# NVIDIA NeMo Evaluator Eval Provider
## Overview
For the first integration, Benchmarks are mapped to Evaluation Configs in NeMo Evaluator. The full evaluation config object is provided as part of the metadata; the `dataset_id` and `scoring_functions` fields are not used.

Below are a few examples of how to register a benchmark (which in turn creates an evaluation config in NeMo Evaluator) and how to trigger an evaluation.
### Example for registering an academic benchmark
```
POST /eval/benchmarks
```
```json
{
"benchmark_id": "mmlu",
"dataset_id": "",
"scoring_functions": [],
"metadata": {
"type": "mmlu"
}
}
```
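The same registration can be made through a Llama Stack client instead of raw REST calls. A minimal sketch, assuming the `nvidia` distribution is set up and the local Evaluator URL is reachable (both assumptions for illustration):
```python
# Sketch: register the academic benchmark above via the library client.
import os

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

os.environ["NVIDIA_EVALUATOR_URL"] = "http://localhost:7331"  # assumed local Evaluator

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

client.benchmarks.register(
    benchmark_id="mmlu",
    dataset_id="",              # unused by this provider
    scoring_functions=[],       # unused by this provider
    metadata={"type": "mmlu"},  # copied as-is into the Evaluator config request
)
```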
### Example for registering a custom evaluation
```
POST /eval/benchmarks
```
```json
{
"benchmark_id": "my-custom-benchmark",
"dataset_id": "",
"scoring_functions": [],
"metadata": {
"type": "custom",
"params": {
"parallelism": 8
},
"tasks": {
"qa": {
"type": "completion",
"params": {
"template": {
"prompt": "{{prompt}}",
"max_tokens": 200
}
},
"dataset": {
"files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
},
"metrics": {
"bleu": {
"type": "bleu",
"params": {
"references": [
"{{ideal_response}}"
]
}
}
}
}
}
}
}
```
### Example for triggering a benchmark/custom evaluation
```
POST /eval/benchmarks/{benchmark_id}/jobs
```
```json
{
"benchmark_id": "my-custom-benchmark",
"benchmark_config": {
"eval_candidate": {
"type": "model",
"model": "meta/llama-3.1-8b-instruct",
"sampling_params": {
"max_tokens": 100,
"temperature": 0.7
}
},
"scoring_params": {}
}
}
```
Response example:
```json
{
"job_id": "1234",
"status": "in_progress"
}
```
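Under the hood, the provider forwards this request to the Evaluator jobs endpoint. A sketch of the equivalent direct call, mirroring `run_eval` in `eval.py` (the service URL is an assumption):
```python
# Sketch: the POST the provider issues for the trigger request above.
import requests

EVALUATOR_URL = "http://localhost:7331"  # assumed local Evaluator service

response = requests.post(
    f"{EVALUATOR_URL}/v1/evaluation/jobs",
    json={
        # benchmark IDs are namespaced under the provider's default "nvidia" namespace
        "config": "nvidia/my-custom-benchmark",
        "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
    },
)
response.raise_for_status()
job_id = response.json()["id"]  # e.g. "1234"
```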
### Example for getting the status of a job
```
GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
```
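The status can be polled until the job finishes. A minimal loop, assuming an initialized Llama Stack `client` and a `job_id` from the trigger call (this mirrors the `wait_eval_job` helper in the e2e notebook):
```python
# Sketch: poll the job status until it leaves the scheduled/in-progress states.
from time import sleep

from llama_stack.apis.common.job_types import JobStatus

job_status = client.eval.jobs.status(benchmark_id="my-custom-benchmark", job_id=job_id)
while job_status in [JobStatus.scheduled, JobStatus.in_progress]:
    sleep(10)  # polling interval in seconds
    job_status = client.eval.jobs.status(benchmark_id="my-custom-benchmark", job_id=job_id)

print(f"Final job status: {job_status}")
```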
### Example for cancelling a job
```
POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
```
### Example for getting the results
```
GET /eval/benchmarks/{benchmark_id}/results
```
```json
{
"generations": [],
"scores": {
"{benchmark_id}": {
"score_rows": [],
"aggregated_results": {
"tasks": {},
"groups": {}
}
}
}
}
```
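The aggregated results follow the task/metric structure defined at registration time. A sketch of pulling a score out of them, with the task and metric names taken from the custom example above and `client` assumed initialized:
```python
# Sketch: retrieve results and extract the sentence-level BLEU score.
results = client.eval.jobs.retrieve(benchmark_id="my-custom-benchmark", job_id=job_id)

bleu = results.scores["my-custom-benchmark"].aggregated_results["tasks"]["qa"]["metrics"][
    "bleu"
]["scores"]["sentence"]["value"]
print(f"BLEU (sentence): {bleu}")
```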


@ -0,0 +1,31 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict

from llama_stack.distribution.datatypes import Api

from .config import NVIDIAEvalConfig


async def get_adapter_impl(
    config: NVIDIAEvalConfig,
    deps: Dict[Api, Any],
):
    from .eval import NVIDIAEvalImpl

    impl = NVIDIAEvalImpl(
        config,
        deps[Api.datasetio],
        deps[Api.datasets],
        deps[Api.scoring],
        deps[Api.inference],
        deps[Api.agents],
    )

    await impl.initialize()
    return impl


__all__ = ["get_adapter_impl", "NVIDIAEvalImpl"]


@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from typing import Any, Dict

from pydantic import BaseModel, Field


class NVIDIAEvalConfig(BaseModel):
    """
    Configuration for the NVIDIA NeMo Evaluator microservice endpoint.

    Attributes:
        evaluator_service_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
    """

    evaluator_service_url: str = Field(
        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the evaluator service",
    )

    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
            "evaluator_service_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
        }
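
A usage note on the config above: because of the `default_factory`, the endpoint can be supplied entirely through the environment. A small sketch, with the local URL as an assumption:

import os

os.environ["NVIDIA_EVALUATOR_URL"] = "http://localhost:7331"  # assumed local Evaluator

config = NVIDIAEvalConfig()          # default_factory picks up NVIDIA_EVALUATOR_URL
print(config.evaluator_service_url)  # -> http://localhost:7331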


@ -0,0 +1,147 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, List

import requests

from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring, ScoringResult
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate

from .....apis.common.job_types import Job, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
from .config import NVIDIAEvalConfig

DEFAULT_NAMESPACE = "nvidia"


class NVIDIAEvalImpl(
    Eval,
    BenchmarksProtocolPrivate,
):
    def __init__(
        self,
        config: NVIDIAEvalConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
        agents_api: Agents,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api
        self.agents_api = agents_api

    async def initialize(self) -> None: ...

    async def shutdown(self) -> None: ...

    async def _evaluator_get(self, path):
        """Helper for making GET requests to the evaluator service."""
        response = requests.get(url=f"{self.config.evaluator_service_url}/{path}")
        response.raise_for_status()
        return response.json()

    async def _evaluator_post(self, path, data):
        """Helper for making POST requests to the evaluator service."""
        response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data)
        response.raise_for_status()
        return response.json()

    async def register_benchmark(self, task_def: Benchmark) -> None:
        """Register a benchmark as an evaluation configuration."""
        await self._evaluator_post(
            "/v1/evaluation/configs",
            {
                "namespace": DEFAULT_NAMESPACE,
                "name": task_def.benchmark_id,
                # metadata is copied to request body as-is
                **task_def.metadata,
            },
        )

    async def run_eval(
        self,
        benchmark_id: str,
        benchmark_config: BenchmarkConfig,
    ) -> Job:
        """Run an evaluation job for a benchmark."""
        model = (
            benchmark_config.eval_candidate.model
            if benchmark_config.eval_candidate.type == "model"
            else benchmark_config.eval_candidate.config.model
        )
        result = await self._evaluator_post(
            "/v1/evaluation/jobs",
            {
                "config": f"{DEFAULT_NAMESPACE}/{benchmark_id}",
                "target": {"type": "model", "model": model},
            },
        )

        return Job(job_id=result["id"], status=JobStatus.in_progress)

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        raise NotImplementedError()

    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
        """Get the status of an evaluation job.

        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
        """
        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}")
        result_status = result["status"]

        job_status = JobStatus.failed
        if result_status in ["created", "pending"]:
            job_status = JobStatus.scheduled
        elif result_status in ["running"]:
            job_status = JobStatus.in_progress
        elif result_status in ["completed"]:
            job_status = JobStatus.completed
        elif result_status in ["cancelled"]:
            job_status = JobStatus.cancelled

        return Job(job_id=job_id, status=job_status)

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        """Cancel the evaluation job."""
        await self._evaluator_post(f"/v1/evaluation/jobs/{job_id}/cancel", {})

    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
        """Returns the results of the evaluation job."""
        job = await self.job_status(benchmark_id, job_id)
        status = job.status
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job {job_id} not completed. Status: {status.value}")

        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}/results")

        return EvaluateResponse(
            # TODO: these are stored in detailed results on NeMo Evaluator side; can be added
            generations=[],
            scores={
                benchmark_id: ScoringResult(
                    score_rows=[],
                    aggregated_results=result,
                )
            },
        )


@ -95,7 +95,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
for _ in range(self.config.max_retries): for _ in range(self.config.max_retries):
# TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/` # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
async with self.session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response: async with self.session.request(
method, url, params=params, json=json, verify_ssl=False, **kwargs
) as response:
if response.status >= 400: if response.status >= 400:
error_data = await response.json() error_data = await response.json()
raise Exception(f"API request failed: {error_data}") raise Exception(f"API request failed: {error_data}")


@ -437,12 +437,10 @@
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
"chardet", "chardet",
"emoji",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
"fire", "fire",
"httpx", "httpx",
"langdetect",
"matplotlib", "matplotlib",
"nltk", "nltk",
"numpy", "numpy",
@ -454,7 +452,6 @@
"psycopg2-binary", "psycopg2-binary",
"pymongo", "pymongo",
"pypdf", "pypdf",
"pythainlp",
"redis", "redis",
"requests", "requests",
"scikit-learn", "scikit-learn",
@ -462,7 +459,6 @@
"sentencepiece", "sentencepiece",
"tqdm", "tqdm",
"transformers", "transformers",
"tree_sitter",
"uvicorn" "uvicorn"
], ],
"ollama": [ "ollama": [


@ -1,6 +1,6 @@
version: '2' version: '2'
distribution_spec: distribution_spec:
description: Use NVIDIA NIM for running LLM inference and safety description: Use NVIDIA NIM for running LLM inference, evaluation and safety
providers: providers:
inference: inference:
- remote::nvidia - remote::nvidia
@ -13,7 +13,7 @@ distribution_spec:
telemetry: telemetry:
- inline::meta-reference - inline::meta-reference
eval: eval:
- inline::meta-reference - remote::nvidia
post_training: post_training:
- remote::nvidia - remote::nvidia
datasetio: datasetio:


@ -7,6 +7,7 @@
from pathlib import Path from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
@ -20,7 +21,7 @@ def get_distribution_template() -> DistributionTemplate:
"safety": ["remote::nvidia"], "safety": ["remote::nvidia"],
"agents": ["inline::meta-reference"], "agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"],
"eval": ["inline::meta-reference"], "eval": ["remote::nvidia"],
"post_training": ["remote::nvidia"], "post_training": ["remote::nvidia"],
"datasetio": ["inline::localfs"], "datasetio": ["inline::localfs"],
"scoring": ["inline::basic"], "scoring": ["inline::basic"],
@ -37,6 +38,11 @@ def get_distribution_template() -> DistributionTemplate:
provider_type="remote::nvidia", provider_type="remote::nvidia",
config=NVIDIASafetyConfig.sample_run_config(), config=NVIDIASafetyConfig.sample_run_config(),
) )
eval_provider = Provider(
provider_id="nvidia",
provider_type="remote::nvidia",
config=NVIDIAEvalConfig.sample_run_config(),
)
inference_model = ModelInput( inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}", model_id="${env.INFERENCE_MODEL}",
provider_id="nvidia", provider_id="nvidia",
@ -60,7 +66,7 @@ def get_distribution_template() -> DistributionTemplate:
return DistributionTemplate( return DistributionTemplate(
name="nvidia", name="nvidia",
distro_type="remote_hosted", distro_type="remote_hosted",
description="Use NVIDIA NIM for running LLM inference and safety", description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
container_image=None, container_image=None,
template_path=Path(__file__).parent / "doc_template.md", template_path=Path(__file__).parent / "doc_template.md",
providers=providers, providers=providers,
@ -69,6 +75,7 @@ def get_distribution_template() -> DistributionTemplate:
"run.yaml": RunConfigSettings( "run.yaml": RunConfigSettings(
provider_overrides={ provider_overrides={
"inference": [inference_provider], "inference": [inference_provider],
"eval": [eval_provider],
}, },
default_models=default_models, default_models=default_models,
default_tool_groups=default_tool_groups, default_tool_groups=default_tool_groups,
@ -78,7 +85,8 @@ def get_distribution_template() -> DistributionTemplate:
"inference": [ "inference": [
inference_provider, inference_provider,
safety_provider, safety_provider,
] ],
"eval": [eval_provider],
}, },
default_models=[inference_model, safety_model], default_models=[inference_model, safety_model],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")], default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
@ -119,6 +127,10 @@ def get_distribution_template() -> DistributionTemplate:
"http://0.0.0.0:7331", "http://0.0.0.0:7331",
"URL for the NeMo Guardrails Service", "URL for the NeMo Guardrails Service",
), ),
"NVIDIA_EVALUATOR_URL": (
"http://0.0.0.0:7331",
"URL for the NeMo Evaluator Service",
),
"INFERENCE_MODEL": ( "INFERENCE_MODEL": (
"Llama3.1-8B-Instruct", "Llama3.1-8B-Instruct",
"Inference model", "Inference model",


@ -53,13 +53,10 @@ providers:
sinks: ${env.TELEMETRY_SINKS:console,sqlite} sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
eval: eval:
- provider_id: meta-reference - provider_id: nvidia
provider_type: inline::meta-reference provider_type: remote::nvidia
config: config:
kvstore: evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
post_training: post_training:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia


@ -48,13 +48,10 @@ providers:
sinks: ${env.TELEMETRY_SINKS:console,sqlite} sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
eval: eval:
- provider_id: meta-reference - provider_id: nvidia
provider_type: inline::meta-reference provider_type: remote::nvidia
config: config:
kvstore: evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
post_training: post_training:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia


@ -0,0 +1,203 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
import unittest
from unittest.mock import MagicMock, patch

import pytest

from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl

MOCK_DATASET_ID = "default/test-dataset"
MOCK_BENCHMARK_ID = "test-benchmark"


class TestNVIDIAEvalImpl(unittest.TestCase):
    def setUp(self):
        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"

        # Create mock APIs
        self.datasetio_api = MagicMock()
        self.datasets_api = MagicMock()
        self.scoring_api = MagicMock()
        self.inference_api = MagicMock()
        self.agents_api = MagicMock()

        self.config = NVIDIAEvalConfig(
            evaluator_service_url=os.environ["NVIDIA_EVALUATOR_URL"],
        )

        self.eval_impl = NVIDIAEvalImpl(
            config=self.config,
            datasetio_api=self.datasetio_api,
            datasets_api=self.datasets_api,
            scoring_api=self.scoring_api,
            inference_api=self.inference_api,
            agents_api=self.agents_api,
        )

        # Mock the HTTP request methods
        self.evaluator_get_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
        )
        self.evaluator_post_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
        )

        self.mock_evaluator_get = self.evaluator_get_patcher.start()
        self.mock_evaluator_post = self.evaluator_post_patcher.start()

        # Set up async test helper
        # self.run_async = self._create_async_helper()

    def tearDown(self):
        """Clean up after each test."""
        self.evaluator_get_patcher.stop()
        self.evaluator_post_patcher.stop()

    def _assert_request_body(self, expected_json):
        """Helper method to verify request body in Evaluator POST request is correct"""
        call_args = self.mock_evaluator_post.call_args
        actual_json = call_args[0][1]

        # Check that all expected keys contain the expected values in the actual JSON
        for key, value in expected_json.items():
            assert key in actual_json, f"Key '{key}' missing in actual JSON"

            if isinstance(value, dict):
                for nested_key, nested_value in value.items():
                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
            else:
                assert actual_json[key] == value, f"Value mismatch for '{key}'"

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, run_async):
        self.run_async = run_async

    def test_register_benchmark(self):
        eval_config = {
            "type": "custom",
            "params": {"parallelism": 8},
            "tasks": {
                "qa": {
                    "type": "completion",
                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
                }
            },
        }

        benchmark = Benchmark(
            provider_id="nvidia",
            type="benchmark",
            identifier=MOCK_BENCHMARK_ID,
            dataset_id=MOCK_DATASET_ID,
            scoring_functions=["basic::equality"],
            metadata=eval_config,
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Register the benchmark
        self.run_async(self.eval_impl.register_benchmark(benchmark))

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})

    def test_run_eval(self):
        benchmark_config = BenchmarkConfig(
            eval_candidate=ModelCandidate(
                type="model",
                model="meta/llama-3.1-8b-instruct",
                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
            )
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Run the Evaluation job
        result = self.run_async(
            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
        )

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body(
            {
                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
                "target": {"type": "model", "model": benchmark_config.eval_candidate.model},
            }
        )

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.in_progress

    def test_job_status(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "completed"}
        self.mock_evaluator_get.return_value = mock_evaluator_response

        # Get the Evaluation job
        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.completed

        # Verify the API was called correctly
        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")

    def test_job_cancel(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Cancel the Evaluation job
        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the API was called correctly
        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})

    def test_job_result(self):
        # Mock Evaluator API responses
        mock_job_status_response = {"id": "job-123", "status": "completed"}
        mock_job_results_response = {
            "id": "job-123",
            "status": "completed",
            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
        }
        self.mock_evaluator_get.side_effect = [
            mock_job_status_response,  # First call to retrieve job
            mock_job_results_response,  # Second call to retrieve job results
        ]

        # Get the Evaluation job results
        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, EvaluateResponse)
        assert MOCK_BENCHMARK_ID in result.scores
        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85

        # Verify the API was called correctly
        assert self.mock_evaluator_get.call_count == 2
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")