{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Part 3: Model Evaluation Using NeMo Evaluator"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import requests\n",
"import random\n",
"from time import sleep, time\n",
"from openai import OpenAI\n",
"\n",
"from config import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Metadata associated with Datasets and Customization Jobs\n",
"os.environ[\"NVIDIA_USER_ID\"] = USER_ID\n",
"os.environ[\"NVIDIA_DATASET_NAMESPACE\"] = NMS_NAMESPACE\n",
"os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n",
"\n",
"## Inference env vars\n",
"os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
"\n",
"# Data Store env vars\n",
"os.environ[\"NVIDIA_DATASETS_URL\"] = NEMO_URL\n",
"\n",
"## Customizer env vars\n",
"os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n",
"# Evaluator env vars\n",
"os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n",
"\n",
"# Guardrails env vars\n",
"os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"\n",
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
"client.initialize()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from llama_stack.apis.common.job_types import JobStatus\n",
"\n",
"def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
"    start_time = time()\n",
"    job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
"    print(f\"Waiting for Evaluation job {job_id} to finish.\")\n",
"    print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
"    while job_status.status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n",
"        sleep(polling_interval)\n",
"        job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
"        print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
"        if time() - start_time > timeout:\n",
"            raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
"    return job_status"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites: Configurations and Health Checks\n",
"Before proceeding, make sure you have completed the previous notebooks on data preparation and model fine-tuning, so that you have the assets required to follow along."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure NeMo Microservices Endpoints\n",
"The following code imports the necessary configurations and prints the endpoints for the NeMo Data Store, Entity Store, Customizer, Evaluator, and NIM, as well as the namespace and base model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from config import *\n",
"\n",
"print(f\"Data Store endpoint: {NDS_URL}\")\n",
"print(f\"Entity Store, Customizer, Evaluator endpoint: {NEMO_URL}\")\n",
"print(f\"NIM endpoint: {NIM_URL}\")\n",
"print(f\"Namespace: {NMS_NAMESPACE}\")\n",
"print(f\"Base Model: {BASE_MODEL}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check Available Models\n",
"Assign the customized model name that you obtained from the previous notebook to the following variable."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Populate this variable with the value from the previous notebook\n",
"# CUSTOMIZED_MODEL = \"\"\n",
"CUSTOMIZED_MODEL = \"jgulabrai-1/test-llama-stack@v1\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following code verifies that the model has been registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"models = client.models.list()\n",
"model_ids = [model.identifier for model in models]\n",
"\n",
"assert CUSTOMIZED_MODEL in model_ids, \\\n",
"    f\"Model {CUSTOMIZED_MODEL} not registered\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following code checks that the NIM endpoint hosts the model properly."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"resp = requests.get(f\"{NIM_URL}/v1/models\")\n",
"\n",
"models = resp.json().get(\"data\", [])\n",
"model_names = [model[\"id\"] for model in models]\n",
"\n",
"assert CUSTOMIZED_MODEL in model_names, \\\n",
"    f\"Model {CUSTOMIZED_MODEL} not found\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Verify the Availability of the Datasets\n",
"In the previous notebook, we registered the test dataset along with the train and validation sets. \n",
"The following code performs a sanity check to validate that the dataset has been registered with Llama Stack and exists in the NeMo Data Store."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"repo_id = f\"{NMS_NAMESPACE}/{DATASET_NAME}\"\n",
"print(repo_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datasets = client.datasets.list()\n",
"dataset_ids = [dataset.identifier for dataset in datasets]\n",
"assert DATASET_NAME in dataset_ids, \\\n",
"    f\"Dataset {DATASET_NAME} not registered\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check to validate dataset\n",
"response = requests.get(url=f\"{NEMO_URL}/v1/datasets/{repo_id}\")\n",
"assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n",
"\n",
"print(\"Files URL:\", response.json()[\"files_url\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Establish Baseline Accuracy Benchmark\n",
"First, we'll assess the accuracy of the off-the-shelf base model, before any fine-tuning, to establish a baseline to compare the customized model against.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1: Create a Benchmark\n",
"Create a benchmark, which creates an evaluation configuration object in NeMo Evaluator. For more information on the various parameters, refer to [NeMo Evaluator configuration](https://developer.nvidia.com/docs/nemo-microservices/evaluate/evaluation-configs.html) in the NeMo microservices documentation.\n",
"- `tasks.custom-tool-calling.dataset.files_url` indicates which test file to use. Note that this file must be uploaded to the NeMo Data Store and registered with the Entity Store before use.\n",
"- The `tasks.dataset.limit` argument specifies how large a subset of the test data to run the evaluation on.\n",
"- The evaluation metric `tasks.metrics.tool-calling-accuracy` reports `function_name_accuracy` (whether the predicted function name matches the ground truth) and `function_name_and_args_accuracy` (whether both the function name and its arguments match)."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"benchmark_id = \"simple-tool-calling-1\"\n",
"simple_tool_calling_eval_config = {\n",
"    \"type\": \"custom\",\n",
"    \"tasks\": {\n",
"        \"custom-tool-calling\": {\n",
"            \"type\": \"chat-completion\",\n",
"            \"dataset\": {\n",
"                \"files_url\": f\"hf://datasets/{NMS_NAMESPACE}/{DATASET_NAME}/testing/xlam-test-single.jsonl\",\n",
"                \"limit\": 50\n",
"            },\n",
"            \"params\": {\n",
"                \"template\": {\n",
"                    \"messages\": \"{{ item.messages | tojson}}\",\n",
"                    \"tools\": \"{{ item.tools | tojson }}\",\n",
"                    \"tool_choice\": \"auto\"\n",
"                }\n",
"            },\n",
"            \"metrics\": {\n",
"                \"tool-calling-accuracy\": {\n",
"                    \"type\": \"tool-calling\",\n",
"                    \"params\": {\"tool_calls_ground_truth\": \"{{ item.tool_calls | tojson }}\"}\n",
"                }\n",
"            }\n",
"        }\n",
"    }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2: Register Benchmark\n",
"To launch an evaluation job using the NeMo Evaluator API, we first register a benchmark using the configuration defined in the previous cell."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"response = client.benchmarks.register(\n",
"    benchmark_id=benchmark_id,\n",
"    dataset_id=repo_id,\n",
"    scoring_functions=[],\n",
"    metadata=simple_tool_calling_eval_config\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3: Launch Evaluation Job\n",
"The following code launches an evaluation job. It uses the benchmark defined in the previous cell and targets the base model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
"    benchmark_id=benchmark_id,\n",
"    benchmark_config={\n",
"        \"eval_candidate\": {\n",
"            \"type\": \"model\",\n",
"            \"model\": BASE_MODEL,\n",
"            \"sampling_params\": {}\n",
"        }\n",
"    }\n",
")\n",
"job_id = response.model_dump()[\"job_id\"]\n",
"print(f\"Created evaluation job {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.4: Review Evaluation Metrics\n",
"The following code retrieves the evaluation results for the base model's evaluation job."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
"print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following code extracts and prints the accuracy scores for the base model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract function name accuracy score\n",
"aggregated_results = job_results.scores[benchmark_id].aggregated_results\n",
"base_function_name_accuracy_score = aggregated_results[\"tasks\"][\"custom-tool-calling\"][\"metrics\"][\"tool-calling-accuracy\"][\"scores\"][\"function_name_accuracy\"][\"value\"]\n",
"base_function_name_and_args_accuracy = aggregated_results[\"tasks\"][\"custom-tool-calling\"][\"metrics\"][\"tool-calling-accuracy\"][\"scores\"][\"function_name_and_args_accuracy\"][\"value\"]\n",
"\n",
"print(f\"Base model: function_name_accuracy: {base_function_name_accuracy_score}\")\n",
"print(f\"Base model: function_name_and_args_accuracy: {base_function_name_and_args_accuracy}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Evaluate the LoRA Customized Model\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1: Launch Evaluation Job\n",
"Run another evaluation job with the same benchmark, but with the customized model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = client.eval.run_eval(\n",
"    benchmark_id=benchmark_id,\n",
"    benchmark_config={\n",
"        \"eval_candidate\": {\n",
"            \"type\": \"model\",\n",
"            \"model\": CUSTOMIZED_MODEL,\n",
"            \"sampling_params\": {}\n",
"        }\n",
"    }\n",
")\n",
"job_id = response.model_dump()[\"job_id\"]\n",
"print(f\"Created evaluation job {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2: Review Evaluation Metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
"print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract function name accuracy score\n",
"aggregated_results = job_results.scores[benchmark_id].aggregated_results\n",
"ft_function_name_accuracy_score = aggregated_results[\"tasks\"][\"custom-tool-calling\"][\"metrics\"][\"tool-calling-accuracy\"][\"scores\"][\"function_name_accuracy\"][\"value\"]\n",
"ft_function_name_and_args_accuracy = aggregated_results[\"tasks\"][\"custom-tool-calling\"][\"metrics\"][\"tool-calling-accuracy\"][\"scores\"][\"function_name_and_args_accuracy\"][\"value\"]\n",
"\n",
"print(f\"Custom model: function_name_accuracy: {ft_function_name_accuracy_score}\")\n",
"print(f\"Custom model: function_name_and_args_accuracy: {ft_function_name_and_args_accuracy}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A successfully fine-tuned `meta/llama-3.2-1b-instruct` results in a significant increase in tool-calling accuracy.\n",
"\n",
"In this case, you should observe roughly the following improvements:\n",
"- `function_name_accuracy`: 12% to 92%\n",
"- `function_name_and_args_accuracy`: 8% to 72%\n",
"\n",
"Since this evaluation ran on a limited number of samples for demonstration purposes, you may choose to increase `tasks.dataset.limit` in your benchmark configuration `simple_tool_calling_eval_config` and re-run the evaluation for a more reliable estimate."
]
}
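,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a final summary, the following cell compares the base and fine-tuned scores side by side. This is a minimal sketch that assumes the base-model and customized-model evaluation cells above were both run in this session, so the `base_*` and `ft_*` score variables are defined."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare baseline vs. fine-tuned tool-calling accuracy.\n",
"# Assumes the base_* and ft_* score variables were set by the evaluation cells above.\n",
"comparison = {\n",
"    \"function_name_accuracy\": (base_function_name_accuracy_score, ft_function_name_accuracy_score),\n",
"    \"function_name_and_args_accuracy\": (base_function_name_and_args_accuracy, ft_function_name_and_args_accuracy),\n",
"}\n",
"\n",
"for metric, (base_score, ft_score) in comparison.items():\n",
"    print(f\"{metric}: base={base_score:.2f}, fine-tuned={ft_score:.2f}, delta={ft_score - base_score:+.2f}\")"
]
}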
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}