In-progress: e2e notebook with partial Eval integration

Jash Gulabrai 2025-04-08 14:08:01 -04:00
parent 861962fa80
commit c04ab0133d
19 changed files with 832 additions and 624 deletions


@ -31,9 +31,9 @@
"import os\n", "import os\n",
"\n", "\n",
"# NVIDIA URLs\n", "# NVIDIA URLs\n",
"NDS_URL = \"\"\n", "NDS_URL = \"https://datastore.int.aire.nvidia.com\"\n",
"NEMO_URL = \"\"\n", "NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
"NIM_URL = \"\"\n", "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
"\n", "\n",
"# Inference env vars\n", "# Inference env vars\n",
"os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n", "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
@ -51,12 +51,15 @@
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n", "\n",
"# Guardrails env vars\n", "# Guardrails env vars\n",
"os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL" "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
"\n",
"# Evaluator env vars\n",
"os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -67,14 +70,14 @@
"from time import sleep, time\n", "from time import sleep, time\n",
"from typing import Dict\n", "from typing import Dict\n",
"\n", "\n",
"import aiohttp\n", "# import aiohttp\n",
"import requests\n", "# import requests\n",
"from huggingface_hub import HfApi\n", "# from huggingface_hub import HfApi\n",
"\n", "\n",
"os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n", "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
"os.environ[\"HF_TOKEN\"] = \"token\"\n", "# os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n", "\n",
"hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))" "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
] ]
}, },
{ {
@ -87,546 +90,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning: `bwrap` is not available. Code interpreter tool will not work correctly.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing NVIDIASafetyAdapter(https://nmp.int.aire.nvidia.com)...\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Using config <span style=\"color: #000080; text-decoration-color: #000080\">nvidia</span>:\n",
"</pre>\n"
],
"text/plain": [
"Using config \u001b[34mnvidia\u001b[0m:\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">apis:\n",
"- agents\n",
"- datasetio\n",
"- eval\n",
"- inference\n",
"- post_training\n",
"- safety\n",
"- scoring\n",
"- telemetry\n",
"- tool_runtime\n",
"- vector_io\n",
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
"container_image: null\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: nvidia\n",
"logging: null\n",
"metadata_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
"models:\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-FP8\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8192</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2048</span>\n",
" model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: nvidia/nv-embedqa-e5-v5\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-e5-v5\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4096</span>\n",
" model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: snowflake/arctic-embed-l\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: snowflake/arctic-embed-l\n",
"providers:\n",
" agents:\n",
" - config:\n",
" persistence_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">agents_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" datasetio:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">localfs_datasetio.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: localfs\n",
" provider_type: inline::localfs\n",
" eval:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">meta_reference_eval.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" inference:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nim.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" post_training:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" customizer_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" dataset_namespace: default\n",
" project_id: test-project\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" safety:\n",
" - config:\n",
" config_id: self-check\n",
" guardrails_service_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" scoring:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: basic\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::ba</span>sic\n",
" telemetry:\n",
" - config:\n",
" service_name: <span style=\"color: #008000; text-decoration-color: #008000\">\"\\u200B\"</span>\n",
" sinks: sqlite\n",
" sqlite_db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">trace_store.db</span>\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" tool_runtime:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: rag-runtime\n",
" provider_type: inline::rag-runtime\n",
" vector_io:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">faiss_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: faiss\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::fa</span>iss\n",
"scoring_fns: <span style=\"font-weight: bold\">[]</span>\n",
"server:\n",
" auth: null\n",
" port: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8321</span>\n",
" tls_certfile: null\n",
" tls_keyfile: null\n",
"shields: <span style=\"font-weight: bold\">[]</span>\n",
"tool_groups:\n",
"- args: null\n",
" mcp_endpoint: null\n",
" provider_id: rag-runtime\n",
" toolgroup_id: builtin::rag\n",
"vector_dbs: <span style=\"font-weight: bold\">[]</span>\n",
"version: <span style=\"color: #008000; text-decoration-color: #008000\">'2'</span>\n",
"\n",
"</pre>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n", "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"\n", "\n",
@ -634,6 +100,53 @@
"client.initialize()" "client.initialize()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Helper functions for waiting on jobs\n",
"from llama_stack.apis.common.job_types import JobStatus\n",
"\n",
"def wait_customization_job(job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
"\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Customization Job {job_id} took more than {timeout} seconds.\")\n",
" \n",
" return job_status\n",
"\n",
"def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
" return job_status\n"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -643,11 +156,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"sample_squad_test_dataset_name = \"jg-llama-stack\"\n", "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
"namespace = \"default\"\n", "namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\"" "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
] ]
@ -767,12 +280,160 @@
"TODO: Implement this section after Evalutor integration is done." "TODO: Implement this section after Evalutor integration is done."
] ]
}, },
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"benchmark_id = \"jg-llama-stack-3\""
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": [
"# Register a benchmark, which creates an Evaluation Config\n",
"simple_eval_config = {\n",
" \"benchmark_id\": benchmark_id,\n",
" \"dataset_id\": \"\",\n",
" \"scoring_functions\": [],\n",
" \"metadata\": {\n",
" \"type\": \"custom\",\n",
" \"params\": {\n",
" \"parallelism\": 8\n",
" },\n",
" \"tasks\": {\n",
" \"qa\": {\n",
" \"type\": \"completion\",\n",
" \"params\": {\n",
" \"template\": {\n",
" \"prompt\": \"{{prompt}}\",\n",
" \"max_tokens\": 200\n",
" }\n",
" },\n",
" \"dataset\": {\n",
" \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
" },\n",
" \"metrics\": {\n",
" \"bleu\": {\n",
" \"type\": \"bleu\",\n",
" \"params\": {\n",
" \"references\": [\n",
" \"{{ideal_response}}\"\n",
" ]\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"response = client.benchmarks.register(\n",
" benchmark_id=benchmark_id,\n",
" dataset_id=repo_id,\n",
" scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
" metadata=simple_eval_config[\"metadata\"]\n",
")\n",
"print(f\"Created benchmark {benchmark_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for benchmark in client.benchmarks.list():\n",
" print(benchmark)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
" benchmark_id=benchmark_id,\n",
" benchmark_config={\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
" \"model\": \"meta/llama-3.1-8b-instruct\",\n",
" \"sampling_params\": {\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 1.0,\n",
" \"top_p\": 0.95,\n",
" },\n",
" \"max_tokens\": 4096,\n",
" \"repeat_penalty\": 1.0,\n",
" },\n",
" }\n",
" }\n",
")\n",
"job_id = response.model_dump()[\"job_id\"]\n",
"print(f\"Created evaluation job {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
"print(f\"Job results: {job_results.model_dump()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract bleu score and assert it's within range\n",
"initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
"print(f\"Initial bleu score: {initial_bleu_score}\")\n",
"\n",
"assert initial_bleu_score >= 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract accuracy and assert it's within range\n",
"initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
"print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
"\n",
"assert initial_accuracy_score >= 0.5"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -827,11 +488,17 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Customized model isn't available in the list of models, so this check doesn't work.\n", "# Wait for the job to complete\n",
"# customized_model = client.models.retrieve(f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\")\n", "job_status = wait_customization_job(job_id=job_id)"
"# assert customized_model is not None\n", ]
"job_status = client.post_training.job.status(job_uuid=job_id)\n", },
"print(f\"Job status: {job_status.status}\")" {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job_status}\")"
] ]
}, },
{ {
@ -840,10 +507,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# TODO: This doesn't work - errors with model_id not found.\n",
"# Depends on https://github.com/meta-llama/llama-stack/pull/1859\n",
"# Verify that inference with the new model works\n", "# Verify that inference with the new model works\n",
"\n",
"from llama_stack.apis.models.models import ModelType\n", "from llama_stack.apis.models.models import ModelType\n",
"\n", "\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
@ -853,14 +517,15 @@
"# provider_id=\"nvidia\",\n", "# provider_id=\"nvidia\",\n",
"# )\n", "# )\n",
"\n", "\n",
"response = client.inference.completion(\n", "# TODO: This won't work until the code above works - errors with model_id not found.\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n", "# response = client.inference.completion(\n",
" stream=False,\n", "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n", "# stream=False,\n",
" sampling_params={\n", "# model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
" \"max_tokens\": 50,\n", "# sampling_params={\n",
" },\n", "# \"max_tokens\": 50,\n",
")" "# },\n",
"# )"
] ]
}, },
{ {
@ -868,7 +533,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## TODO: Evaluate Customized Model\n", "## TODO: Evaluate Customized Model\n",
"Implement this section after Evalutor integration is done." "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
] ]
}, },
{ {
@ -1078,39 +743,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/plain": [
"Shield(identifier='self-check', provider_id='nvidia', provider_resource_id='self-check', type='shield', params={}, access_attributes=None)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")" "client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'model': 'self-check', 'messages': [{'role': 'system', 'content': 'You are stupid.'}], 'temperature': 1.0, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'max_tokens': 160, 'stream': False, 'guardrails': {'config_id': 'self-check'}}\n",
"{'status': 'success', 'rails_status': {'self check input': {'status': 'success'}}, 'guardrails_data': {'log': {'activated_rails': [], 'stats': {'total_duration': 0.0007240772247314453, 'llm_calls_duration': 0.0, 'llm_calls_count': 0, 'llm_calls_total_prompt_tokens': 0, 'llm_calls_total_completion_tokens': 0, 'llm_calls_total_tokens': 0}}}}\n",
"Safety response: RunShieldResponse(violation=None)\n"
]
}
],
"source": [ "source": [
"# Check inference with guardrails\n", "# Check inference with guardrails\n",
"# TODO: For some reason, `role: \"user\"` returns a 422 error.\n", "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
@ -1154,7 +798,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.2" "version": "3.11.10"
} }
}, },
"nbformat": 4, "nbformat": 4,


@ -7,7 +7,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|-----|-------------| |-----|-------------|
| agents | `inline::meta-reference` | | agents | `inline::meta-reference` |
| datasetio | `inline::localfs` | | datasetio | `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `remote::nvidia` |
| inference | `remote::nvidia` | | inference | `remote::nvidia` |
| post_training | `remote::nvidia` | | post_training | `remote::nvidia` |
| safety | `remote::nvidia` | | safety | `remote::nvidia` |
@ -29,6 +29,7 @@ The following environment variables can be configured:
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`) - `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`) - `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)


@ -6,7 +6,7 @@
from typing import List from typing import List
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
def available_providers() -> List[ProviderSpec]: def available_providers() -> List[ProviderSpec]:
@ -25,4 +25,22 @@ def available_providers() -> List[ProviderSpec]:
Api.agents, Api.agents,
], ],
), ),
remote_provider_spec(
api=Api.eval,
adapter=AdapterSpec(
adapter_type="nvidia",
pip_packages=[
"requests",
],
module="llama_stack.providers.remote.eval.nvidia",
config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
),
api_dependencies=[
Api.datasetio,
Api.datasets,
Api.scoring,
Api.inference,
Api.agents,
],
),
] ]


@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@ -0,0 +1,126 @@
# NVIDIA NeMo Evaluator Eval Provider
## Overview
For the first integration, Benchmarks are mapped to Evaluation Configs in NeMo Evaluator. The full evaluation config object is provided as part of the metadata; the `dataset_id` and `scoring_functions` fields are not used.

Below are a few examples of how to register a benchmark (which in turn creates an evaluation config in NeMo Evaluator) and how to trigger an evaluation.
### Example for registering an academic benchmark
```
POST /eval/benchmarks
```
```json
{
"benchmark_id": "mmlu",
"dataset_id": "",
"scoring_functions": [],
"metadata": {
"type": "mmlu"
}
}
```
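The same registration can be made through a Llama Stack client instead of raw REST calls. A minimal sketch, assuming the `nvidia` distribution is set up and the local Evaluator URL is reachable (both assumptions for illustration):
```python
# Sketch: register the academic benchmark above via the library client.
import os

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

os.environ["NVIDIA_EVALUATOR_URL"] = "http://localhost:7331"  # assumed local Evaluator

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

client.benchmarks.register(
    benchmark_id="mmlu",
    dataset_id="",              # unused by this provider
    scoring_functions=[],       # unused by this provider
    metadata={"type": "mmlu"},  # copied as-is into the Evaluator config request
)
```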
### Example for registering a custom evaluation
```
POST /eval/benchmarks
```
```json
{
"benchmark_id": "my-custom-benchmark",
"dataset_id": "",
"scoring_functions": [],
"metadata": {
"type": "custom",
"params": {
"parallelism": 8
},
"tasks": {
"qa": {
"type": "completion",
"params": {
"template": {
"prompt": "{{prompt}}",
"max_tokens": 200
}
},
"dataset": {
"files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
},
"metrics": {
"bleu": {
"type": "bleu",
"params": {
"references": [
"{{ideal_response}}"
]
}
}
}
}
}
}
}
```
### Example for triggering a benchmark/custom evaluation
```
POST /eval/benchmarks/{benchmark_id}/jobs
```
```json
{
"benchmark_id": "my-custom-benchmark",
"benchmark_config": {
"eval_candidate": {
"type": "model",
"model": "meta/llama-3.1-8b-instruct",
"sampling_params": {
"max_tokens": 100,
"temperature": 0.7
}
},
"scoring_params": {}
}
}
```
Response example:
```json
{
"job_id": "1234",
"status": "in_progress"
}
```
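Under the hood, the provider forwards this request to the Evaluator jobs endpoint. A sketch of the equivalent direct call, mirroring `run_eval` in `eval.py` (the service URL is an assumption):
```python
# Sketch: the POST the provider issues for the trigger request above.
import requests

EVALUATOR_URL = "http://localhost:7331"  # assumed local Evaluator service

response = requests.post(
    f"{EVALUATOR_URL}/v1/evaluation/jobs",
    json={
        # benchmark IDs are namespaced under the provider's default "nvidia" namespace
        "config": "nvidia/my-custom-benchmark",
        "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
    },
)
response.raise_for_status()
job_id = response.json()["id"]  # e.g. "1234"
```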
### Example for getting the status of a job
```
GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
```
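The status can be polled until the job finishes. A minimal loop, assuming an initialized Llama Stack `client` and a `job_id` from the trigger call (this mirrors the `wait_eval_job` helper in the e2e notebook):
```python
# Sketch: poll the job status until it leaves the scheduled/in-progress states.
from time import sleep

from llama_stack.apis.common.job_types import JobStatus

job_status = client.eval.jobs.status(benchmark_id="my-custom-benchmark", job_id=job_id)
while job_status in [JobStatus.scheduled, JobStatus.in_progress]:
    sleep(10)  # polling interval in seconds
    job_status = client.eval.jobs.status(benchmark_id="my-custom-benchmark", job_id=job_id)

print(f"Final job status: {job_status}")
```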
### Example for cancelling a job
```
POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
```
### Example for getting the results
```
GET /eval/benchmarks/{benchmark_id}/results
```
```json
{
"generations": [],
"scores": {
"{benchmark_id}": {
"score_rows": [],
"aggregated_results": {
"tasks": {},
"groups": {}
}
}
}
}
```
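The aggregated results follow the task/metric structure defined at registration time. A sketch of pulling a score out of them, with the task and metric names taken from the custom example above and `client` assumed initialized:
```python
# Sketch: retrieve results and extract the sentence-level BLEU score.
results = client.eval.jobs.retrieve(benchmark_id="my-custom-benchmark", job_id=job_id)

bleu = results.scores["my-custom-benchmark"].aggregated_results["tasks"]["qa"]["metrics"][
    "bleu"
]["scores"]["sentence"]["value"]
print(f"BLEU (sentence): {bleu}")
```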


@ -0,0 +1,31 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict

from llama_stack.distribution.datatypes import Api

from .config import NVIDIAEvalConfig


async def get_adapter_impl(
    config: NVIDIAEvalConfig,
    deps: Dict[Api, Any],
):
    from .eval import NVIDIAEvalImpl

    impl = NVIDIAEvalImpl(
        config,
        deps[Api.datasetio],
        deps[Api.datasets],
        deps[Api.scoring],
        deps[Api.inference],
        deps[Api.agents],
    )

    await impl.initialize()
    return impl


__all__ = ["get_adapter_impl", "NVIDIAEvalImpl"]


@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from typing import Any, Dict

from pydantic import BaseModel, Field


class NVIDIAEvalConfig(BaseModel):
    """
    Configuration for the NVIDIA NeMo Evaluator microservice endpoint.

    Attributes:
        evaluator_service_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
    """

    evaluator_service_url: str = Field(
        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the evaluator service",
    )

    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
            "evaluator_service_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
        }
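
A usage note on the config above: because of the `default_factory`, the endpoint can be supplied entirely through the environment. A small sketch, with the local URL as an assumption:

import os

os.environ["NVIDIA_EVALUATOR_URL"] = "http://localhost:7331"  # assumed local Evaluator

config = NVIDIAEvalConfig()          # default_factory picks up NVIDIA_EVALUATOR_URL
print(config.evaluator_service_url)  # -> http://localhost:7331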


@ -0,0 +1,147 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, List

import requests

from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring, ScoringResult
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate

from .....apis.common.job_types import Job, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
from .config import NVIDIAEvalConfig

DEFAULT_NAMESPACE = "nvidia"


class NVIDIAEvalImpl(
    Eval,
    BenchmarksProtocolPrivate,
):
    def __init__(
        self,
        config: NVIDIAEvalConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
        agents_api: Agents,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api
        self.agents_api = agents_api

    async def initialize(self) -> None: ...

    async def shutdown(self) -> None: ...

    async def _evaluator_get(self, path):
        """Helper for making GET requests to the evaluator service."""
        response = requests.get(url=f"{self.config.evaluator_service_url}/{path}")
        response.raise_for_status()
        return response.json()

    async def _evaluator_post(self, path, data):
        """Helper for making POST requests to the evaluator service."""
        response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data)
        response.raise_for_status()
        return response.json()

    async def register_benchmark(self, task_def: Benchmark) -> None:
        """Register a benchmark as an evaluation configuration."""
        await self._evaluator_post(
            "/v1/evaluation/configs",
            {
                "namespace": DEFAULT_NAMESPACE,
                "name": task_def.benchmark_id,
                # metadata is copied to request body as-is
                **task_def.metadata,
            },
        )

    async def run_eval(
        self,
        benchmark_id: str,
        benchmark_config: BenchmarkConfig,
    ) -> Job:
        """Run an evaluation job for a benchmark."""
        model = (
            benchmark_config.eval_candidate.model
            if benchmark_config.eval_candidate.type == "model"
            else benchmark_config.eval_candidate.config.model
        )
        result = await self._evaluator_post(
            "/v1/evaluation/jobs",
            {
                "config": f"{DEFAULT_NAMESPACE}/{benchmark_id}",
                "target": {"type": "model", "model": model},
            },
        )

        return Job(job_id=result["id"], status=JobStatus.in_progress)

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        raise NotImplementedError()

    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
        """Get the status of an evaluation job.

        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
        """
        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}")
        result_status = result["status"]

        job_status = JobStatus.failed
        if result_status in ["created", "pending"]:
            job_status = JobStatus.scheduled
        elif result_status in ["running"]:
            job_status = JobStatus.in_progress
        elif result_status in ["completed"]:
            job_status = JobStatus.completed
        elif result_status in ["cancelled"]:
            job_status = JobStatus.cancelled

        return Job(job_id=job_id, status=job_status)

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        """Cancel the evaluation job."""
        await self._evaluator_post(f"/v1/evaluation/jobs/{job_id}/cancel", {})

    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
        """Returns the results of the evaluation job."""
        job = await self.job_status(benchmark_id, job_id)
        status = job.status
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job {job_id} not completed. Status: {status.value}")

        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}/results")

        return EvaluateResponse(
            # TODO: these are stored in detailed results on NeMo Evaluator side; can be added
            generations=[],
            scores={
                benchmark_id: ScoringResult(
                    score_rows=[],
                    aggregated_results=result,
                )
            },
        )


@ -95,7 +95,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
for _ in range(self.config.max_retries): for _ in range(self.config.max_retries):
# TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/` # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
async with self.session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response: async with self.session.request(
method, url, params=params, json=json, verify_ssl=False, **kwargs
) as response:
if response.status >= 400: if response.status >= 400:
error_data = await response.json() error_data = await response.json()
raise Exception(f"API request failed: {error_data}") raise Exception(f"API request failed: {error_data}")


@ -437,12 +437,10 @@
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
"chardet", "chardet",
"emoji",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
"fire", "fire",
"httpx", "httpx",
"langdetect",
"matplotlib", "matplotlib",
"nltk", "nltk",
"numpy", "numpy",
@ -454,7 +452,6 @@
"psycopg2-binary", "psycopg2-binary",
"pymongo", "pymongo",
"pypdf", "pypdf",
"pythainlp",
"redis", "redis",
"requests", "requests",
"scikit-learn", "scikit-learn",
@ -462,7 +459,6 @@
"sentencepiece", "sentencepiece",
"tqdm", "tqdm",
"transformers", "transformers",
"tree_sitter",
"uvicorn" "uvicorn"
], ],
"ollama": [ "ollama": [


@ -1,6 +1,6 @@
version: '2' version: '2'
distribution_spec: distribution_spec:
description: Use NVIDIA NIM for running LLM inference and safety description: Use NVIDIA NIM for running LLM inference, evaluation and safety
providers: providers:
inference: inference:
- remote::nvidia - remote::nvidia
@ -13,7 +13,7 @@ distribution_spec:
telemetry: telemetry:
- inline::meta-reference - inline::meta-reference
eval: eval:
- inline::meta-reference - remote::nvidia
post_training: post_training:
- remote::nvidia - remote::nvidia
datasetio: datasetio:


@ -7,6 +7,7 @@
from pathlib import Path from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
@ -20,7 +21,7 @@ def get_distribution_template() -> DistributionTemplate:
"safety": ["remote::nvidia"], "safety": ["remote::nvidia"],
"agents": ["inline::meta-reference"], "agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"],
"eval": ["inline::meta-reference"], "eval": ["remote::nvidia"],
"post_training": ["remote::nvidia"], "post_training": ["remote::nvidia"],
"datasetio": ["inline::localfs"], "datasetio": ["inline::localfs"],
"scoring": ["inline::basic"], "scoring": ["inline::basic"],
@ -37,6 +38,11 @@ def get_distribution_template() -> DistributionTemplate:
provider_type="remote::nvidia", provider_type="remote::nvidia",
config=NVIDIASafetyConfig.sample_run_config(), config=NVIDIASafetyConfig.sample_run_config(),
) )
eval_provider = Provider(
provider_id="nvidia",
provider_type="remote::nvidia",
config=NVIDIAEvalConfig.sample_run_config(),
)
inference_model = ModelInput( inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}", model_id="${env.INFERENCE_MODEL}",
provider_id="nvidia", provider_id="nvidia",
@ -60,7 +66,7 @@ def get_distribution_template() -> DistributionTemplate:
return DistributionTemplate( return DistributionTemplate(
name="nvidia", name="nvidia",
distro_type="remote_hosted", distro_type="remote_hosted",
description="Use NVIDIA NIM for running LLM inference and safety", description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
container_image=None, container_image=None,
template_path=Path(__file__).parent / "doc_template.md", template_path=Path(__file__).parent / "doc_template.md",
providers=providers, providers=providers,
@ -69,6 +75,7 @@ def get_distribution_template() -> DistributionTemplate:
"run.yaml": RunConfigSettings( "run.yaml": RunConfigSettings(
provider_overrides={ provider_overrides={
"inference": [inference_provider], "inference": [inference_provider],
"eval": [eval_provider],
}, },
default_models=default_models, default_models=default_models,
default_tool_groups=default_tool_groups, default_tool_groups=default_tool_groups,
@ -78,7 +85,8 @@ def get_distribution_template() -> DistributionTemplate:
"inference": [ "inference": [
inference_provider, inference_provider,
safety_provider, safety_provider,
] ],
"eval": [eval_provider],
}, },
default_models=[inference_model, safety_model], default_models=[inference_model, safety_model],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")], default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
@ -119,6 +127,10 @@ def get_distribution_template() -> DistributionTemplate:
"http://0.0.0.0:7331", "http://0.0.0.0:7331",
"URL for the NeMo Guardrails Service", "URL for the NeMo Guardrails Service",
), ),
"NVIDIA_EVALUATOR_URL": (
"http://0.0.0.0:7331",
"URL for the NeMo Evaluator Service",
),
"INFERENCE_MODEL": ( "INFERENCE_MODEL": (
"Llama3.1-8B-Instruct", "Llama3.1-8B-Instruct",
"Inference model", "Inference model",


@ -53,13 +53,10 @@ providers:
sinks: ${env.TELEMETRY_SINKS:console,sqlite} sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
eval: eval:
- provider_id: meta-reference - provider_id: nvidia
provider_type: inline::meta-reference provider_type: remote::nvidia
config: config:
kvstore: evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
post_training: post_training:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia


@ -48,13 +48,10 @@ providers:
sinks: ${env.TELEMETRY_SINKS:console,sqlite} sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
eval: eval:
- provider_id: meta-reference - provider_id: nvidia
provider_type: inline::meta-reference provider_type: remote::nvidia
config: config:
kvstore: evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
post_training: post_training:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia


@ -0,0 +1,203 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
import unittest
from unittest.mock import MagicMock, patch

import pytest

from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl

MOCK_DATASET_ID = "default/test-dataset"
MOCK_BENCHMARK_ID = "test-benchmark"


class TestNVIDIAEvalImpl(unittest.TestCase):
    def setUp(self):
        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"

        # Create mock APIs
        self.datasetio_api = MagicMock()
        self.datasets_api = MagicMock()
        self.scoring_api = MagicMock()
        self.inference_api = MagicMock()
        self.agents_api = MagicMock()

        self.config = NVIDIAEvalConfig(
            evaluator_service_url=os.environ["NVIDIA_EVALUATOR_URL"],
        )

        self.eval_impl = NVIDIAEvalImpl(
            config=self.config,
            datasetio_api=self.datasetio_api,
            datasets_api=self.datasets_api,
            scoring_api=self.scoring_api,
            inference_api=self.inference_api,
            agents_api=self.agents_api,
        )

        # Mock the HTTP request methods
        self.evaluator_get_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
        )
        self.evaluator_post_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
        )

        self.mock_evaluator_get = self.evaluator_get_patcher.start()
        self.mock_evaluator_post = self.evaluator_post_patcher.start()

        # Set up async test helper
        # self.run_async = self._create_async_helper()

    def tearDown(self):
        """Clean up after each test."""
        self.evaluator_get_patcher.stop()
        self.evaluator_post_patcher.stop()

    def _assert_request_body(self, expected_json):
        """Helper method to verify request body in Evaluator POST request is correct"""
        call_args = self.mock_evaluator_post.call_args
        actual_json = call_args[0][1]

        # Check that all expected keys contain the expected values in the actual JSON
        for key, value in expected_json.items():
            assert key in actual_json, f"Key '{key}' missing in actual JSON"

            if isinstance(value, dict):
                for nested_key, nested_value in value.items():
                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
            else:
                assert actual_json[key] == value, f"Value mismatch for '{key}'"

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, run_async):
        self.run_async = run_async

    def test_register_benchmark(self):
        eval_config = {
            "type": "custom",
            "params": {"parallelism": 8},
            "tasks": {
                "qa": {
                    "type": "completion",
                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
                }
            },
        }

        benchmark = Benchmark(
            provider_id="nvidia",
            type="benchmark",
            identifier=MOCK_BENCHMARK_ID,
            dataset_id=MOCK_DATASET_ID,
            scoring_functions=["basic::equality"],
            metadata=eval_config,
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Register the benchmark
        self.run_async(self.eval_impl.register_benchmark(benchmark))

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})

    def test_run_eval(self):
        benchmark_config = BenchmarkConfig(
            eval_candidate=ModelCandidate(
                type="model",
                model="meta/llama-3.1-8b-instruct",
                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
            )
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Run the Evaluation job
        result = self.run_async(
            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
        )

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body(
            {
                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
                "target": {"type": "model", "model": benchmark_config.eval_candidate.model},
            }
        )

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.in_progress

    def test_job_status(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "completed"}
        self.mock_evaluator_get.return_value = mock_evaluator_response

        # Get the Evaluation job
        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.completed

        # Verify the API was called correctly
        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")

    def test_job_cancel(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Cancel the Evaluation job
        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the API was called correctly
        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})

    def test_job_result(self):
        # Mock Evaluator API responses
        mock_job_status_response = {"id": "job-123", "status": "completed"}
        mock_job_results_response = {
            "id": "job-123",
            "status": "completed",
            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
        }
        self.mock_evaluator_get.side_effect = [
            mock_job_status_response,  # First call to retrieve job
            mock_job_results_response,  # Second call to retrieve job results
        ]

        # Get the Evaluation job results
        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, EvaluateResponse)
        assert MOCK_BENCHMARK_ID in result.scores
        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85

        # Verify the API was called correctly
        assert self.mock_evaluator_get.call_count == 2
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")