Repository: https://github.com/meta-llama/llama-stack.git
In-progress: e2e notebook with partial Eval integration
Commit: c04ab0133d (parent: 861962fa80)
19 changed files with 832 additions and 624 deletions
@@ -31,9 +31,9 @@
     "import os\n",
     "\n",
     "# NVIDIA URLs\n",
-    "NDS_URL = \"\"\n",
-    "NEMO_URL = \"\"\n",
-    "NIM_URL = \"\"\n",
+    "NDS_URL = \"https://datastore.int.aire.nvidia.com\"\n",
+    "NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
+    "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
     "\n",
     "# Inference env vars\n",
     "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
@@ -51,12 +51,15 @@
     "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
     "\n",
     "# Guardrails env vars\n",
-    "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL"
+    "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
+    "\n",
+    "# Evaluator env vars\n",
+    "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
    ]
  },
  {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -67,14 +70,14 @@
     "from time import sleep, time\n",
     "from typing import Dict\n",
     "\n",
-    "import aiohttp\n",
-    "import requests\n",
-    "from huggingface_hub import HfApi\n",
+    "# import aiohttp\n",
+    "# import requests\n",
+    "# from huggingface_hub import HfApi\n",
     "\n",
-    "os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
-    "os.environ[\"HF_TOKEN\"] = \"token\"\n",
+    "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
+    "# os.environ[\"HF_TOKEN\"] = \"token\"\n",
     "\n",
-    "hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
+    "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
    ]
  },
  {
@@ -87,546 +90,9 @@
  },
  {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    (… roughly 535 deleted lines of captured cell output: a "`bwrap` is not available" stderr warning, the "Initializing NVIDIASafetyAdapter(https://nmp.int.aire.nvidia.com)..." message, and the rendered "Using config nvidia" run config — apis, models, providers, metadata_store, server settings, version '2' — emitted twice, once as styled text/html and once as ANSI-escaped text/plain, followed by a final `True` execute_result …)
-   ],
+   "outputs": [],
    "source": [
     "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
     "\n",
@@ -634,6 +100,53 @@
     "client.initialize()"
    ]
  },
+ {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper functions for waiting on jobs\n",
+    "from llama_stack.apis.common.job_types import JobStatus\n",
+    "\n",
+    "def wait_customization_job(job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
+    "    start_time = time()\n",
+    "\n",
+    "    response = client.post_training.job.status(job_uuid=job_id)\n",
+    "    job_status = response.status\n",
+    "\n",
+    "    print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
+    "\n",
+    "    while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+    "        sleep(polling_interval)\n",
+    "        response = client.post_training.job.status(job_uuid=job_id)\n",
+    "        job_status = response.status\n",
+    "\n",
+    "        print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
+    "\n",
+    "        if time() - start_time > timeout:\n",
+    "            raise RuntimeError(f\"Customization Job {job_id} took more than {timeout} seconds.\")\n",
+    "    \n",
+    "    return job_status\n",
+    "\n",
+    "def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
+    "    start_time = time()\n",
+    "    job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
+    "\n",
+    "    print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
+    "\n",
+    "    while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+    "        sleep(polling_interval)\n",
+    "        job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
+    "\n",
+    "        print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
+    "\n",
+    "        if time() - start_time > timeout:\n",
+    "            raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
+    "\n",
+    "    return job_status\n"
+   ]
+ },
  {
    "cell_type": "markdown",
    "metadata": {},
@@ -643,11 +156,11 @@
  },
  {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "sample_squad_test_dataset_name = \"jg-llama-stack\"\n",
+    "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
     "namespace = \"default\"\n",
     "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
    ]
@@ -767,12 +280,160 @@
     "TODO: Implement this section after Evalutor integration is done."
    ]
  },
+ {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "benchmark_id = \"jg-llama-stack-3\""
+   ]
+ },
  {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Register a benchmark, which creates an Evaluation Config\n",
+    "simple_eval_config = {\n",
+    "    \"benchmark_id\": benchmark_id,\n",
+    "    \"dataset_id\": \"\",\n",
+    "    \"scoring_functions\": [],\n",
+    "    \"metadata\": {\n",
+    "        \"type\": \"custom\",\n",
+    "        \"params\": {\n",
+    "            \"parallelism\": 8\n",
+    "        },\n",
+    "        \"tasks\": {\n",
+    "            \"qa\": {\n",
+    "                \"type\": \"completion\",\n",
+    "                \"params\": {\n",
+    "                    \"template\": {\n",
+    "                        \"prompt\": \"{{prompt}}\",\n",
+    "                        \"max_tokens\": 200\n",
+    "                    }\n",
+    "                },\n",
+    "                \"dataset\": {\n",
+    "                    \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
+    "                },\n",
+    "                \"metrics\": {\n",
+    "                    \"bleu\": {\n",
+    "                        \"type\": \"bleu\",\n",
+    "                        \"params\": {\n",
+    "                            \"references\": [\n",
+    "                                \"{{ideal_response}}\"\n",
+    "                            ]\n",
+    "                        }\n",
+    "                    }\n",
+    "                }\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "response = client.benchmarks.register(\n",
+    "    benchmark_id=benchmark_id,\n",
+    "    dataset_id=repo_id,\n",
+    "    scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
+    "    metadata=simple_eval_config[\"metadata\"]\n",
+    ")\n",
+    "print(f\"Created benchmark {benchmark_id}\")"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for benchmark in client.benchmarks.list():\n",
+    "    print(benchmark)"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Launch a simple evaluation with the benchmark\n",
+    "response = client.eval.run_eval(\n",
+    "    benchmark_id=benchmark_id,\n",
+    "    benchmark_config={\n",
+    "        \"eval_candidate\": {\n",
+    "            \"type\": \"model\",\n",
+    "            \"model\": \"meta/llama-3.1-8b-instruct\",\n",
+    "            \"sampling_params\": {\n",
+    "                \"strategy\": {\n",
+    "                    \"type\": \"top_p\",\n",
+    "                    \"temperature\": 1.0,\n",
+    "                    \"top_p\": 0.95,\n",
+    "                },\n",
+    "                \"max_tokens\": 4096,\n",
+    "                \"repeat_penalty\": 1.0,\n",
+    "            },\n",
+    "        }\n",
+    "    }\n",
+    ")\n",
+    "job_id = response.model_dump()[\"job_id\"]\n",
+    "print(f\"Created evaluation job {job_id}\")"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wait for the job to complete\n",
+    "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Job {job_id} status: {job.status}\")"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
+    "print(f\"Job results: {job_results.model_dump()}\")"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract bleu score and assert it's within range\n",
+    "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
+    "print(f\"Initial bleu score: {initial_bleu_score}\")\n",
+    "\n",
+    "assert initial_bleu_score >= 2"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract accuracy and assert it's within range\n",
+    "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+    "print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
+    "\n",
+    "assert initial_accuracy_score >= 0.5"
+   ]
  },
  {
    "cell_type": "markdown",
@@ -827,11 +488,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Customized model isn't available in the list of models, so this check doesn't work.\n",
-    "# customized_model = client.models.retrieve(f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\")\n",
-    "# assert customized_model is not None\n",
-    "job_status = client.post_training.job.status(job_uuid=job_id)\n",
-    "print(f\"Job status: {job_status.status}\")"
+    "# Wait for the job to complete\n",
+    "job_status = wait_customization_job(job_id=job_id)"
+   ]
+ },
+ {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Job {job_id} status: {job_status}\")"
    ]
  },
  {
@@ -840,10 +507,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# TODO: This doesn't work - errors with model_id not found.\n",
-    "# Depends on https://github.com/meta-llama/llama-stack/pull/1859\n",
     "# Verify that inference with the new model works\n",
-    "\n",
     "from llama_stack.apis.models.models import ModelType\n",
     "\n",
     "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
@@ -853,14 +517,15 @@
     "# provider_id=\"nvidia\",\n",
     "# )\n",
     "\n",
-    "response = client.inference.completion(\n",
-    "    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
-    "    stream=False,\n",
-    "    model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
-    "    sampling_params={\n",
-    "        \"max_tokens\": 50,\n",
-    "    },\n",
-    ")"
+    "# TODO: This won't work until the code above works - errors with model_id not found.\n",
+    "# response = client.inference.completion(\n",
+    "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+    "# stream=False,\n",
+    "# model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
+    "# sampling_params={\n",
+    "# \"max_tokens\": 50,\n",
+    "# },\n",
+    "# )"
    ]
  },
  {
@@ -868,7 +533,7 @@
    "metadata": {},
    "source": [
     "## TODO: Evaluate Customized Model\n",
-    "Implement this section after Evalutor integration is done."
+    "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
    ]
  },
  {
@@ -1078,39 +743,18 @@
  },
  {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Shield(identifier='self-check', provider_id='nvidia', provider_resource_id='self-check', type='shield', params={}, access_attributes=None)"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")"
    ]
  },
  {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'model': 'self-check', 'messages': [{'role': 'system', 'content': 'You are stupid.'}], 'temperature': 1.0, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'max_tokens': 160, 'stream': False, 'guardrails': {'config_id': 'self-check'}}\n",
-      "{'status': 'success', 'rails_status': {'self check input': {'status': 'success'}}, 'guardrails_data': {'log': {'activated_rails': [], 'stats': {'total_duration': 0.0007240772247314453, 'llm_calls_duration': 0.0, 'llm_calls_count': 0, 'llm_calls_total_prompt_tokens': 0, 'llm_calls_total_completion_tokens': 0, 'llm_calls_total_tokens': 0}}}}\n",
-      "Safety response: RunShieldResponse(violation=None)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Check inference with guardrails\n",
     "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
@@ -1154,7 +798,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.2"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

@@ -7,7 +7,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 |-----|-------------|
 | agents | `inline::meta-reference` |
 | datasetio | `inline::localfs` |
-| eval | `inline::meta-reference` |
+| eval | `remote::nvidia` |
 | inference | `remote::nvidia` |
 | post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |

@@ -29,6 +29,7 @@ The following environment variables can be configured:
 - `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
 - `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
 - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
+- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
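The e2e notebook in this commit sets the guardrails and evaluator variables in Python before building the client; a minimal sketch, assuming a placeholder NeMo platform URL:

```python
import os

# Placeholder base URL for a NeMo microservices platform deployment;
# substitute your own (see NDS_URL/NEMO_URL/NIM_URL in the notebook).
NEMO_URL = "https://nmp.example.com"

# In this setup the Guardrails and Evaluator services share the NeMo platform URL.
os.environ["GUARDRAILS_SERVICE_URL"] = NEMO_URL
os.environ["NVIDIA_EVALUATOR_URL"] = NEMO_URL
```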

@@ -6,7 +6,7 @@

 from typing import List

-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec


 def available_providers() -> List[ProviderSpec]:
@@ -25,4 +25,22 @@ def available_providers() -> List[ProviderSpec]:
                 Api.agents,
             ],
         ),
+        remote_provider_spec(
+            api=Api.eval,
+            adapter=AdapterSpec(
+                adapter_type="nvidia",
+                pip_packages=[
+                    "requests",
+                ],
+                module="llama_stack.providers.remote.eval.nvidia",
+                config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
+            ),
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+                Api.scoring,
+                Api.inference,
+                Api.agents,
+            ],
+        ),
     ]

llama_stack/providers/remote/eval/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.

llama_stack/providers/remote/eval/nvidia/README.md (new file, 126 lines)
@@ -0,0 +1,126 @@
+# NVIDIA NeMo Evaluator Eval Provider
+
+
+## Overview
+
+For the first integration, Benchmarks are mapped to Evaluation Configs in the NeMo Evaluator. The full evaluation config object is provided as part of the metadata. The `dataset_id` and `scoring_functions` are not used.
+
+Below are a few examples of how to register a benchmark, which in turn will create an evaluation config in NeMo Evaluator, and how to trigger an evaluation.
+
+### Example for registering an academic benchmark
+
+```
+POST /eval/benchmarks
+```
+```json
+{
+  "benchmark_id": "mmlu",
+  "dataset_id": "",
+  "scoring_functions": [],
+  "metadata": {
+    "type": "mmlu"
+  }
+}
+```
+
+### Example for registering a custom evaluation
+
+```
+POST /eval/benchmarks
+```
+```json
+{
+  "benchmark_id": "my-custom-benchmark",
+  "dataset_id": "",
+  "scoring_functions": [],
+  "metadata": {
+    "type": "custom",
+    "params": {
+      "parallelism": 8
+    },
+    "tasks": {
+      "qa": {
+        "type": "completion",
+        "params": {
+          "template": {
+            "prompt": "{{prompt}}",
+            "max_tokens": 200
+          }
+        },
+        "dataset": {
+          "files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
+        },
+        "metrics": {
+          "bleu": {
+            "type": "bleu",
+            "params": {
+              "references": [
+                "{{ideal_response}}"
+              ]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+### Example for triggering a benchmark/custom evaluation
+
+```
+POST /eval/benchmarks/{benchmark_id}/jobs
+```
+```json
+{
+  "benchmark_id": "my-custom-benchmark",
+  "benchmark_config": {
+    "eval_candidate": {
+      "type": "model",
+      "model": "meta/llama-3.1-8b-instruct",
+      "sampling_params": {
+        "max_tokens": 100,
+        "temperature": 0.7
+      }
+    },
+    "scoring_params": {}
+  }
+}
+```
+
+Response example:
+```json
+{
+  "job_id": "1234",
+  "status": "in_progress"
+}
+```
+
+### Example for getting the status of a job
+```
+GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
+```
+
+### Example for cancelling a job
+```
+POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
+```
+
+### Example for getting the results
+```
+GET /eval/benchmarks/{benchmark_id}/results
+```
+```json
+{
+  "generations": [],
+  "scores": {
+    "{benchmark_id}": {
+      "score_rows": [],
+      "aggregated_results": {
+        "tasks": {},
+        "groups": {}
+      }
+    }
+  }
+}
+```
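The e2e notebook updated in this commit drives the same flow through the llama-stack library client rather than raw HTTP. A minimal sketch under that assumption — the `nvidia` distribution name and the `mmlu` metadata follow the examples above, and the polling interval is illustrative, not a definitive implementation:

```python
# Sketch: register a benchmark and run an evaluation via the library client,
# mirroring the e2e notebook in this commit.
from time import sleep

from llama_stack.apis.common.job_types import JobStatus
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

# The metadata carries the full NeMo evaluation config ("mmlu" per the
# academic example above); dataset_id and scoring_functions are unused.
client.benchmarks.register(
    benchmark_id="mmlu",
    dataset_id="",
    scoring_functions=[],
    metadata={"type": "mmlu"},
)

# Trigger the evaluation with a model candidate.
response = client.eval.run_eval(
    benchmark_id="mmlu",
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta/llama-3.1-8b-instruct",
            "sampling_params": {"max_tokens": 100, "temperature": 0.7},
        }
    },
)
job_id = response.model_dump()["job_id"]

# Poll until the job leaves the scheduled/in-progress states, then fetch results.
job_status = client.eval.jobs.status(benchmark_id="mmlu", job_id=job_id)
while job_status in [JobStatus.scheduled, JobStatus.in_progress]:
    sleep(10)
    job_status = client.eval.jobs.status(benchmark_id="mmlu", job_id=job_id)

job_results = client.eval.jobs.retrieve(benchmark_id="mmlu", job_id=job_id)
print(job_results.model_dump())
```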

llama_stack/providers/remote/eval/nvidia/__init__.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict
+
+from llama_stack.distribution.datatypes import Api
+
+from .config import NVIDIAEvalConfig
+
+
+async def get_adapter_impl(
+    config: NVIDIAEvalConfig,
+    deps: Dict[Api, Any],
+):
+    from .eval import NVIDIAEvalImpl
+
+    impl = NVIDIAEvalImpl(
+        config,
+        deps[Api.datasetio],
+        deps[Api.datasets],
+        deps[Api.scoring],
+        deps[Api.inference],
+        deps[Api.agents],
+    )
+    await impl.initialize()
+    return impl
+
+
+__all__ = ["get_adapter_impl", "NVIDIAEvalImpl"]
llama_stack/providers/remote/eval/nvidia/config.py (new file, 29 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any, Dict

from pydantic import BaseModel, Field


class NVIDIAEvalConfig(BaseModel):
    """
    Configuration for the NVIDIA NeMo Evaluator microservice endpoint.

    Attributes:
        evaluator_service_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
    """

    evaluator_service_url: str = Field(
        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the evaluator service",
    )

    @classmethod
    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
        return {
            "evaluator_service_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
        }
```
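A quick sanity check of how this config resolves its URL; this is a sketch, not part of the diff:

```python
import os

from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig

# With the env var set, the default_factory picks it up.
os.environ["NVIDIA_EVALUATOR_URL"] = "http://localhost:7331"
assert NVIDIAEvalConfig().evaluator_service_url == "http://localhost:7331"

# Without it, the hard-coded fallback applies.
del os.environ["NVIDIA_EVALUATOR_URL"]
assert NVIDIAEvalConfig().evaluator_service_url == "http://0.0.0.0:7331"
```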
llama_stack/providers/remote/eval/nvidia/eval.py (new file, 147 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List

import requests

from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring, ScoringResult
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate

from .....apis.common.job_types import Job, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
from .config import NVIDIAEvalConfig

DEFAULT_NAMESPACE = "nvidia"


class NVIDIAEvalImpl(
    Eval,
    BenchmarksProtocolPrivate,
):
    def __init__(
        self,
        config: NVIDIAEvalConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
        agents_api: Agents,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api
        self.agents_api = agents_api

    async def initialize(self) -> None: ...

    async def shutdown(self) -> None: ...

    async def _evaluator_get(self, path):
        """Helper for making GET requests to the evaluator service."""
        response = requests.get(url=f"{self.config.evaluator_service_url}/{path}")
        response.raise_for_status()
        return response.json()

    async def _evaluator_post(self, path, data):
        """Helper for making POST requests to the evaluator service."""
        response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data)
        response.raise_for_status()
        return response.json()

    async def register_benchmark(self, task_def: Benchmark) -> None:
        """Register a benchmark as an evaluation configuration."""
        await self._evaluator_post(
            "/v1/evaluation/configs",
            {
                "namespace": DEFAULT_NAMESPACE,
                "name": task_def.benchmark_id,
                # metadata is copied to the request body as-is
                **task_def.metadata,
            },
        )

    async def run_eval(
        self,
        benchmark_id: str,
        benchmark_config: BenchmarkConfig,
    ) -> Job:
        """Run an evaluation job for a benchmark."""
        model = (
            benchmark_config.eval_candidate.model
            if benchmark_config.eval_candidate.type == "model"
            else benchmark_config.eval_candidate.config.model
        )
        result = await self._evaluator_post(
            "/v1/evaluation/jobs",
            {
                "config": f"{DEFAULT_NAMESPACE}/{benchmark_id}",
                "target": {"type": "model", "model": model},
            },
        )

        return Job(job_id=result["id"], status=JobStatus.in_progress)

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        raise NotImplementedError()

    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
        """Get the status of an evaluation job.

        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
        """
        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}")
        result_status = result["status"]

        job_status = JobStatus.failed
        if result_status in ["created", "pending"]:
            job_status = JobStatus.scheduled
        elif result_status in ["running"]:
            job_status = JobStatus.in_progress
        elif result_status in ["completed"]:
            job_status = JobStatus.completed
        elif result_status in ["cancelled"]:
            job_status = JobStatus.cancelled

        return Job(job_id=job_id, status=job_status)

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        """Cancel the evaluation job."""
        await self._evaluator_post(f"/v1/evaluation/jobs/{job_id}/cancel", {})

    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
        """Returns the results of the evaluation job."""

        job = await self.job_status(benchmark_id, job_id)
        status = job.status
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job {job_id} not completed. Status: {status}")

        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}/results")

        return EvaluateResponse(
            # TODO: these are stored in detailed results on the NeMo Evaluator side; can be added
            generations=[],
            scores={
                benchmark_id: ScoringResult(
                    score_rows=[],
                    aggregated_results=result,
                )
            },
        )
```
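A minimal driver sketch (not part of the diff) showing how the adapter's job lifecycle methods compose. The `None` placeholders are an assumption that holds only because `job_status` and `job_result` never touch the other APIs; a real deployment would wire in the actual implementations:

```python
import asyncio

from llama_stack.apis.common.job_types import JobStatus
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl

TERMINAL = (JobStatus.completed, JobStatus.failed, JobStatus.cancelled)


async def wait_for_results(benchmark_id: str, job_id: str, interval: float = 10.0):
    config = NVIDIAEvalConfig()  # reads NVIDIA_EVALUATOR_URL from the environment
    # None placeholders: these code paths only use the evaluator HTTP helpers.
    impl = NVIDIAEvalImpl(config, None, None, None, None, None)

    # Poll job_status until the evaluator reports a terminal state.
    while (job := await impl.job_status(benchmark_id, job_id)).status not in TERMINAL:
        await asyncio.sleep(interval)

    if job.status == JobStatus.completed:
        return await impl.job_result(benchmark_id, job_id)
    raise RuntimeError(f"Job {job_id} ended with status {job.status}")


# asyncio.run(wait_for_results("my-custom-benchmark", "job-123"))
```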
```
@@ -95,7 +95,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
 
         for _ in range(self.config.max_retries):
             # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
-            async with self.session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response:
+            async with self.session.request(
+                method, url, params=params, json=json, verify_ssl=False, **kwargs
+            ) as response:
                 if response.status >= 400:
                     error_data = await response.json()
                     raise Exception(f"API request failed: {error_data}")
```
```
@@ -437,12 +437,10 @@
         "aiosqlite",
         "blobfile",
         "chardet",
-        "emoji",
         "faiss-cpu",
         "fastapi",
         "fire",
         "httpx",
-        "langdetect",
         "matplotlib",
         "nltk",
         "numpy",
@@ -454,7 +452,6 @@
         "psycopg2-binary",
         "pymongo",
         "pypdf",
-        "pythainlp",
         "redis",
         "requests",
         "scikit-learn",
@@ -462,7 +459,6 @@
         "sentencepiece",
         "tqdm",
         "transformers",
-        "tree_sitter",
         "uvicorn"
     ],
     "ollama": [
```
```
@@ -1,6 +1,6 @@
 version: '2'
 distribution_spec:
-  description: Use NVIDIA NIM for running LLM inference and safety
+  description: Use NVIDIA NIM for running LLM inference, evaluation and safety
 providers:
   inference:
   - remote::nvidia
@@ -13,7 +13,7 @@ distribution_spec:
   telemetry:
   - inline::meta-reference
   eval:
-  - inline::meta-reference
+  - remote::nvidia
   post_training:
   - remote::nvidia
   datasetio:
```
```
@@ -7,6 +7,7 @@
 from pathlib import Path
 
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
+from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
@@ -20,7 +21,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["remote::nvidia"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
+        "eval": ["remote::nvidia"],
         "post_training": ["remote::nvidia"],
         "datasetio": ["inline::localfs"],
         "scoring": ["inline::basic"],
@@ -37,6 +38,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIASafetyConfig.sample_run_config(),
     )
+    eval_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NVIDIAEvalConfig.sample_run_config(),
+    )
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="nvidia",
@@ -60,7 +66,7 @@ def get_distribution_template() -> DistributionTemplate:
     return DistributionTemplate(
         name="nvidia",
         distro_type="remote_hosted",
-        description="Use NVIDIA NIM for running LLM inference and safety",
+        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
         container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
@@ -69,6 +75,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "eval": [eval_provider],
                 },
                 default_models=default_models,
                 default_tool_groups=default_tool_groups,
@@ -78,7 +85,8 @@ def get_distribution_template() -> DistributionTemplate:
                     "inference": [
                         inference_provider,
                         safety_provider,
-                    ]
+                    ],
+                    "eval": [eval_provider],
                 },
                 default_models=[inference_model, safety_model],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
@@ -119,6 +127,10 @@ def get_distribution_template() -> DistributionTemplate:
             "http://0.0.0.0:7331",
             "URL for the NeMo Guardrails Service",
         ),
+        "NVIDIA_EVALUATOR_URL": (
+            "http://0.0.0.0:7331",
+            "URL for the NeMo Evaluator Service",
+        ),
         "INFERENCE_MODEL": (
             "Llama3.1-8B-Instruct",
             "Inference model",
```
```
@@ -53,13 +53,10 @@ providers:
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
   eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
+      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
   post_training:
   - provider_id: nvidia
     provider_type: remote::nvidia
```
```
@@ -48,13 +48,10 @@ providers:
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
   eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
+      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
   post_training:
   - provider_id: nvidia
     provider_type: remote::nvidia
```
tests/unit/providers/nvidia/test_eval.py (new file, 203 lines)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
import unittest
from unittest.mock import MagicMock, patch

import pytest

from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl

MOCK_DATASET_ID = "default/test-dataset"
MOCK_BENCHMARK_ID = "test-benchmark"


class TestNVIDIAEvalImpl(unittest.TestCase):
    def setUp(self):
        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"

        # Create mock APIs
        self.datasetio_api = MagicMock()
        self.datasets_api = MagicMock()
        self.scoring_api = MagicMock()
        self.inference_api = MagicMock()
        self.agents_api = MagicMock()

        self.config = NVIDIAEvalConfig(
            evaluator_service_url=os.environ["NVIDIA_EVALUATOR_URL"],
        )

        self.eval_impl = NVIDIAEvalImpl(
            config=self.config,
            datasetio_api=self.datasetio_api,
            datasets_api=self.datasets_api,
            scoring_api=self.scoring_api,
            inference_api=self.inference_api,
            agents_api=self.agents_api,
        )

        # Mock the HTTP request methods
        self.evaluator_get_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
        )
        self.evaluator_post_patcher = patch(
            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
        )

        self.mock_evaluator_get = self.evaluator_get_patcher.start()
        self.mock_evaluator_post = self.evaluator_post_patcher.start()

    def tearDown(self):
        """Clean up after each test."""
        self.evaluator_get_patcher.stop()
        self.evaluator_post_patcher.stop()

    def _assert_request_body(self, expected_json):
        """Helper method to verify that the request body in the Evaluator POST request is correct."""
        call_args = self.mock_evaluator_post.call_args
        actual_json = call_args[0][1]

        # Check that all expected keys contain the expected values in the actual JSON
        for key, value in expected_json.items():
            assert key in actual_json, f"Key '{key}' missing in actual JSON"

            if isinstance(value, dict):
                for nested_key, nested_value in value.items():
                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
            else:
                assert actual_json[key] == value, f"Value mismatch for '{key}'"

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, run_async):
        self.run_async = run_async

    def test_register_benchmark(self):
        eval_config = {
            "type": "custom",
            "params": {"parallelism": 8},
            "tasks": {
                "qa": {
                    "type": "completion",
                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
                }
            },
        }

        benchmark = Benchmark(
            provider_id="nvidia",
            type="benchmark",
            identifier=MOCK_BENCHMARK_ID,
            dataset_id=MOCK_DATASET_ID,
            scoring_functions=["basic::equality"],
            metadata=eval_config,
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Register the benchmark
        self.run_async(self.eval_impl.register_benchmark(benchmark))

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})

    def test_run_eval(self):
        benchmark_config = BenchmarkConfig(
            eval_candidate=ModelCandidate(
                type="model",
                model="meta/llama-3.1-8b-instruct",
                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
            )
        )

        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "created"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Run the Evaluation job
        result = self.run_async(
            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
        )

        # Verify the Evaluator API was called correctly
        self.mock_evaluator_post.assert_called_once()
        self._assert_request_body(
            {
                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
                "target": {"type": "model", "model": benchmark_config.eval_candidate.model},
            }
        )

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.in_progress

    def test_job_status(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "completed"}
        self.mock_evaluator_get.return_value = mock_evaluator_response

        # Get the Evaluation job
        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, Job)
        assert result.job_id == "job-123"
        assert result.status == JobStatus.completed

        # Verify the API was called correctly
        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")

    def test_job_cancel(self):
        # Mock Evaluator API response
        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
        self.mock_evaluator_post.return_value = mock_evaluator_response

        # Cancel the Evaluation job
        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the API was called correctly
        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})

    def test_job_result(self):
        # Mock Evaluator API responses
        mock_job_status_response = {"id": "job-123", "status": "completed"}
        mock_job_results_response = {
            "id": "job-123",
            "status": "completed",
            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
        }
        self.mock_evaluator_get.side_effect = [
            mock_job_status_response,  # First call to retrieve the job
            mock_job_results_response,  # Second call to retrieve the job results
        ]

        # Get the Evaluation job results
        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))

        # Verify the result
        assert isinstance(result, EvaluateResponse)
        assert MOCK_BENCHMARK_ID in result.scores
        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85

        # Verify the API was called correctly
        assert self.mock_evaluator_get.call_count == 2
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
```
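One thing these tests assume but this diff does not add: the `run_async` fixture injected above, which typically lives in a shared `conftest.py`. A minimal sketch of what such a fixture could look like; the project's actual definition may differ:

```python
# Hypothetical conftest.py content, not part of this diff.
import asyncio

import pytest


@pytest.fixture
def run_async():
    def _run(coro):
        return asyncio.run(coro)

    return _run
```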