diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 567110829..6b5793119 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6347,7 +6347,36 @@
"default": "model"
},
"model": {
- "type": "string",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ ],
"description": "The model ID to evaluate."
},
"sampling_params": {
@@ -6362,8 +6391,7 @@
"additionalProperties": false,
"required": [
"type",
- "model",
- "sampling_params"
+ "model"
],
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 1dfd17f55..8c5746900 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4468,7 +4468,17 @@ components:
const: model
default: model
model:
- type: string
+ oneOf:
+ - type: string
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
description: The model ID to evaluate.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
@@ -4482,7 +4492,6 @@ components:
required:
- type
- model
- - sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
RegexParserScoringFnParams:
diff --git a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
index e9f2d9a9e..0be64073c 100644
--- a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This notebook contains Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA."
+    "This notebook contains a Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using the NVIDIA provider."
]
},
{
@@ -12,7 +12,49 @@
"metadata": {},
"source": [
"## Prerequisites\n",
- "- Please reference to setup the NVIDIA platform. "
+ "First, ensure the NeMo Microservices platform is up and running, including the model downloading step for `meta/llama-3.2-1b-instruct`. See installation instructions: https://aire.gitlab-master-pages.nvidia.com/microservices/documentation/latest/nemo-microservices/latest-internal/set-up/deploy-as-platform/index.html (TODO: Update to public docs)"
+ ]
+ },
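+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, confirm that the platform is reachable by listing the models NIM is currently serving. The cell below is a minimal sketch that assumes the example `NIM_URL` used in the Setup section; adjust it to your deployment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal reachability check (assumes the example NIM URL below; adjust as needed).\n",
+    "import requests\n",
+    "\n",
+    "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
+    "response = requests.get(f\"{NIM_URL}/v1/models\")\n",
+    "response.raise_for_status()\n",
+    "print([model[\"id\"] for model in response.json()[\"data\"]])"
+   ]
+  },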
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Next, set up your development environment. From the root of the project, create and activate a virtual environment:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "uv sync --extra dev\n",
+ "uv pip install -e .\n",
+ "source .venv/bin/activate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Build the Llama Stack image using the virtual environment. For local development, set `LLAMA_STACK_DIR` to ensure your local code is used in the image. To use the production version of `llama-stack`, omit `LLAMA_STACK_DIR`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv"
]
},
{
@@ -22,9 +64,23 @@
"## Setup\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Configure the environment variables for each service.\n",
+ "\n",
+ "If needed, update the URLs for each service to point to your deployment.\n",
+ "- NDS_URL: NeMo Data Store URL\n",
+ "- NEMO_URL: NeMo Microservices Platform URL\n",
+ "- NIM_URL: NIM URL\n",
+ "\n",
+    "For more information about these variables, see the [NVIDIA Distro documentation](docs/source/distributions/remote_hosted_distro/nvidia.md)."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -35,13 +91,13 @@
"NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
"NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
"\n",
- "# Inference env vars\n",
- "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
- "\n",
"USER_ID = \"llama-stack-user\"\n",
"NAMESPACE = \"default\"\n",
- "PROJECT_ID = \"test-project\"\n",
- "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n",
+ "PROJECT_ID = \"\"\n",
+ "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v2\"\n",
+ "\n",
+ "# Inference env vars\n",
+ "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
"\n",
"# Customizer env vars\n",
"os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n",
@@ -50,16 +106,16 @@
"os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n",
- "# Guardrails env vars\n",
- "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
- "\n",
"# Evaluator env vars\n",
- "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
+ "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n",
+ "\n",
+ "# Guardrails env vars\n",
+ "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -70,14 +126,14 @@
"from time import sleep, time\n",
"from typing import Dict\n",
"\n",
- "# import aiohttp\n",
- "# import requests\n",
- "# from huggingface_hub import HfApi\n",
+ "import aiohttp\n",
+ "import requests\n",
+ "from huggingface_hub import HfApi\n",
"\n",
- "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
- "# os.environ[\"HF_TOKEN\"] = \"token\"\n",
+ "os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
+ "os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n",
- "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
+ "hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
]
},
{
@@ -115,9 +171,10 @@
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
+ " print(f\"Waiting for Customization job {job_id} to finish.\")\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
- " while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+ " while job_status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n",
" sleep(polling_interval)\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
@@ -133,9 +190,10 @@
" start_time = time()\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
+ " print(f\"Waiting for Evaluation job {job_id} to finish.\")\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
- " while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
+ " while job_status.status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n",
" sleep(polling_interval)\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
@@ -144,14 +202,49 @@
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
- " return job_status\n"
+ " return job_status\n",
+ "\n",
+ "def wait_nim_loads_customized_model(model_id: str, namespace: str, polling_interval: int = 10, timeout: int = 300):\n",
+ " found = False\n",
+ " start_time = time()\n",
+ "\n",
+ " model_path = f\"{namespace}/{model_id}\"\n",
+ " print(f\"Checking if NIM has loaded customized model {model_path}.\")\n",
+ "\n",
+    "    while not found and (time() - start_time) < timeout:\n",
+ " sleep(polling_interval)\n",
+ "\n",
+ " response = requests.get(f\"{NIM_URL}/v1/models\")\n",
+ " if model_path in [model[\"id\"] for model in response.json()[\"data\"]]:\n",
+ " found = True\n",
+ " print(f\"Model {model_path} available after {time() - start_time} seconds.\")\n",
+ " break\n",
+ " else:\n",
+ " print(f\"Model {model_path} not available after {time() - start_time} seconds.\")\n",
+ "\n",
+    "    if not found:\n",
+    "        raise RuntimeError(f\"Model {model_path} not available after {timeout} seconds.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Upload Dataset Using the HuggingFace Client"
+ "## Upload Dataset Using the HuggingFace Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_squad_test_dataset_name = \"squad-test-dataset\"\n",
+ "namespace = \"default\"\n",
+ "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
]
},
{
@@ -159,20 +252,9 @@
"execution_count": 6,
"metadata": {},
"outputs": [],
- "source": [
- "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
- "namespace = \"default\"\n",
- "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
"source": [
"# Create the repo\n",
- "# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
+ "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
@@ -182,24 +264,24 @@
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/training\",\n",
- "# path_in_repo=\"training\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/validation\",\n",
- "# path_in_repo=\"validation\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_data/testing\",\n",
- "# path_in_repo=\"testing\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )"
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/training\",\n",
+ " path_in_repo=\"training\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/validation\",\n",
+ " path_in_repo=\"validation\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_data/testing\",\n",
+ " path_in_repo=\"testing\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")"
]
},
{
@@ -209,7 +291,18 @@
"outputs": [],
"source": [
"# Create the dataset\n",
- "# response = client.datasets.register(...)"
+ "# response = client.datasets.register(...)\n",
+ "response = requests.post(\n",
+ " url=f\"{NEMO_URL}/v1/datasets\",\n",
+ " json={\n",
+ " \"name\": sample_squad_test_dataset_name,\n",
+ " \"namespace\": namespace,\n",
+ " \"description\": \"Dataset created from llama-stack e2e notebook\",\n",
+ " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_test_dataset_name}\",\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n",
+ "json.dumps(response.json(), indent=2)"
]
},
{
@@ -221,7 +314,14 @@
"# Check the files URL\n",
"# response = client.datasets.retrieve(repo_id)\n",
"# dataset = response.model_dump()\n",
- "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\""
+ "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n",
+ "response = requests.get(\n",
+ " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_test_dataset_name}\",\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n",
+ "dataset_obj = response.json()\n",
+ "print(\"Files URL:\", dataset_obj[\"files_url\"])\n",
+ "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\""
]
},
{
@@ -276,22 +376,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Evaluation\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Evaluation\n"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "benchmark_id = \"jg-llama-stack-3\""
+ "benchmark_id = \"test-eval-config-1\""
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -302,36 +401,41 @@
" \"scoring_functions\": [],\n",
" \"metadata\": {\n",
" \"type\": \"custom\",\n",
- " \"params\": {\n",
- " \"parallelism\": 8\n",
- " },\n",
+ " \"params\": {\"parallelism\": 8},\n",
" \"tasks\": {\n",
" \"qa\": {\n",
" \"type\": \"completion\",\n",
" \"params\": {\n",
" \"template\": {\n",
" \"prompt\": \"{{prompt}}\",\n",
- " \"max_tokens\": 200\n",
- " }\n",
- " },\n",
- " \"dataset\": {\n",
- " \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
" },\n",
+ " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n",
" \"metrics\": {\n",
" \"bleu\": {\n",
" \"type\": \"bleu\",\n",
- " \"params\": {\n",
- " \"references\": [\n",
- " \"{{ideal_response}}\"\n",
- " ]\n",
- " }\n",
- " }\n",
- " }\n",
+ " \"params\": {\"references\": [\"{{ideal_response}}\"]},\n",
+ " },\n",
+ " \"string-check\": {\n",
+ " \"type\": \"string-check\",\n",
+ " \"params\": {\"check\": [\"{{ideal_response | trim}}\", \"equals\", \"{{output_text | trim}}\"]},\n",
+ " },\n",
+ " },\n",
" }\n",
" }\n",
" }\n",
- "}\n",
- "\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"response = client.benchmarks.register(\n",
" benchmark_id=benchmark_id,\n",
" dataset_id=repo_id,\n",
@@ -347,32 +451,13 @@
"metadata": {},
"outputs": [],
"source": [
- "for benchmark in client.benchmarks.list():\n",
- " print(benchmark)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Launch a simple evaluation with the benchmark\n",
+ "# Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
" benchmark_id=benchmark_id,\n",
" benchmark_config={\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
- " \"model\": \"meta/llama-3.1-8b-instruct\",\n",
- " \"sampling_params\": {\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 1.0,\n",
- " \"top_p\": 0.95,\n",
- " },\n",
- " \"max_tokens\": 4096,\n",
- " \"repeat_penalty\": 1.0,\n",
- " },\n",
+ " \"model\": \"meta/llama-3.1-8b-instruct\"\n",
" }\n",
" }\n",
")\n",
@@ -406,7 +491,7 @@
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
- "print(f\"Job results: {job_results.model_dump()}\")"
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
]
},
{
@@ -416,7 +501,7 @@
"outputs": [],
"source": [
"# Extract bleu score and assert it's within range\n",
- "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
+ "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
"print(f\"Initial bleu score: {initial_bleu_score}\")\n",
"\n",
"assert initial_bleu_score >= 2"
@@ -429,10 +514,10 @@
"outputs": [],
"source": [
"# Extract accuracy and assert it's within range\n",
- "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
"print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
"\n",
- "assert initial_accuracy_score >= 0.5"
+ "assert initial_accuracy_score >= 0"
]
},
{
@@ -507,7 +592,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Verify that inference with the new model works\n",
+ "# Check that inference with the new model works\n",
"from llama_stack.apis.models.models import ModelType\n",
"\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
@@ -525,7 +610,20 @@
"# sampling_params={\n",
"# \"max_tokens\": 50,\n",
"# },\n",
- "# )"
+ "# )\n",
+ "\n",
+ "res = requests.post(\n",
+ " url=f\"{NIM_URL}/v1/completions\",\n",
+ " json={\n",
+ " \"model\": f\"{namespace}/{CUSTOMIZED_MODEL_DIR}\",\n",
+ " \"prompt\": sample_prompt,\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ ")\n",
+ "assert res.status_code in (200, 201), f\"Status Code {res.status_code} Failed to get adapted model completion {res.text}\"\n",
+ "json.dumps(res.json(), indent=2)"
]
},
{
@@ -533,37 +631,37 @@
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model\n",
- "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
+    "Implement this section once the customized model can be registered in the Model Registry."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Upload Chat Dataset\n",
- "Implement this section after Data Store integration is done.\n",
+ "## Upload Chat Dataset\n",
"Repeat fine-tuning and evaluation with a chat style dataset, which has a list of `messages` instead of a `prompt` and `completion`."
]
},
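+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the sketch below shows the expected shape of one chat-style record (hypothetical contents): a `messages` list whose final assistant turn serves as the reference answer. Each line of the JSONL files under `./tmp/sample_squad_messages/` is assumed to follow this structure."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example of a single chat-style record (illustration only, not read from disk).\n",
+    "import json\n",
+    "\n",
+    "example_record = {\n",
+    "    \"messages\": [\n",
+    "        {\"role\": \"system\", \"content\": \"You answer questions concisely.\"},\n",
+    "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+    "        {\"role\": \"assistant\", \"content\": \"Paris\"},\n",
+    "    ]\n",
+    "}\n",
+    "print(json.dumps(example_record, indent=2))"
+   ]
+  },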
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
- "sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n",
+ "sample_squad_messages_dataset_name = \"test-squad-messages-dataset\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\""
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
- "# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
+    "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
@@ -573,24 +671,24 @@
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/training\",\n",
- "# path_in_repo=\"training\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/validation\",\n",
- "# path_in_repo=\"validation\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )\n",
- "# hf_api.upload_folder(\n",
- "# folder_path=\"./tmp/sample_squad_messages/testing\",\n",
- "# path_in_repo=\"testing\",\n",
- "# repo_id=repo_id,\n",
- "# repo_type=\"dataset\",\n",
- "# )"
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/training\",\n",
+ " path_in_repo=\"training\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/validation\",\n",
+ " path_in_repo=\"validation\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")\n",
+ "hf_api.upload_folder(\n",
+ " folder_path=\"./tmp/sample_squad_messages/testing\",\n",
+ " path_in_repo=\"testing\",\n",
+ " repo_id=repo_id,\n",
+ " repo_type=\"dataset\",\n",
+ ")"
]
},
{
@@ -600,7 +698,38 @@
"outputs": [],
"source": [
"# Create the dataset\n",
- "# response = client.datasets.register(...)"
+ "# response = client.datasets.register(...)\n",
+ "response = requests.post(\n",
+ " url=f\"{NEMO_URL}/v1/datasets\",\n",
+ " json={\n",
+ " \"name\": sample_squad_messages_dataset_name,\n",
+ " \"namespace\": namespace,\n",
+ " \"description\": \"Dataset created from llama-stack e2e notebook\",\n",
+ " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n",
+ " \"project\": \"default/project-7tLfD8Lt59wFbarFceF3xN\",\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n",
+ "json.dumps(response.json(), indent=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check the files URL\n",
+ "# response = client.datasets.retrieve(repo_id)\n",
+ "# dataset = response.model_dump()\n",
+ "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n",
+ "response = requests.get(\n",
+ " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n",
+ "dataset_obj = response.json()\n",
+ "print(\"Files URL:\", dataset_obj[\"files_url\"])\n",
+ "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\""
]
},
{
@@ -651,8 +780,151 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Evaluate with chat dataset\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Evaluate with chat dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "benchmark_id = \"test-eval-config-chat-1\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Register a benchmark, which creates an Eval Config\n",
+ "simple_eval_config = {\n",
+ " \"benchmark_id\": benchmark_id,\n",
+ " \"dataset_id\": \"\",\n",
+ " \"scoring_functions\": [],\n",
+ " \"metadata\": {\n",
+ " \"type\": \"custom\",\n",
+ " \"params\": {\"parallelism\": 8},\n",
+ " \"tasks\": {\n",
+ " \"qa\": {\n",
+ " \"type\": \"completion\",\n",
+ " \"params\": {\n",
+ " \"template\": {\n",
+ " \"messages\": [\n",
+ " {\"role\": \"{{item.messages[0].role}}\", \"content\": \"{{item.messages[0].content}}\"},\n",
+ " {\"role\": \"{{item.messages[1].role}}\", \"content\": \"{{item.messages[1].content}}\"},\n",
+ " ],\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ " },\n",
+ " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n",
+ " \"metrics\": {\n",
+ " \"bleu\": {\n",
+ " \"type\": \"bleu\",\n",
+ " \"params\": {\"references\": [\"{{item.messages[2].content | trim}}\"]},\n",
+ " },\n",
+ " \"string-check\": {\n",
+ " \"type\": \"string-check\",\n",
+ " \"params\": {\"check\": [\"{{item.messages[2].content}}\", \"equals\", \"{{output_text | trim}}\"]},\n",
+ " },\n",
+ " },\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.benchmarks.register(\n",
+ " benchmark_id=benchmark_id,\n",
+ " dataset_id=repo_id,\n",
+ " scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
+ " metadata=simple_eval_config[\"metadata\"]\n",
+ ")\n",
+ "print(f\"Created benchmark {benchmark_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Launch a simple evaluation with the benchmark\n",
+ "response = client.eval.run_eval(\n",
+ " benchmark_id=benchmark_id,\n",
+ " benchmark_config={\n",
+ " \"eval_candidate\": {\n",
+ " \"type\": \"model\",\n",
+ " \"model\": \"meta/llama-3.1-8b-instruct\",\n",
+ " }\n",
+ " }\n",
+ ")\n",
+ "job_id = response.model_dump()[\"job_id\"]\n",
+ "print(f\"Created evaluation job {job_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Wait for the job to complete\n",
+ "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Job {job_id} status: {job.status}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract bleu score and assert it's within range\n",
+ "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "print(f\"Initial bleu score: {initial_bleu_score}\")\n",
+ "\n",
+ "assert initial_bleu_score >= 12"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract accuracy and assert it's within range\n",
+ "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
+ "print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
+ "\n",
+ "assert initial_accuracy_score >= 0.2"
]
},
{
@@ -668,13 +940,12 @@
"metadata": {},
"outputs": [],
"source": [
- "customized_model_name = \"messages-example-model\"\n",
- "customized_model_version = \"v2\"\n",
+ "customized_model_name = \"test-messages-model\"\n",
+ "customized_model_version = \"v1\"\n",
"customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n",
"\n",
- "# TODO: We need to re-initialize the client here to pick up the new env vars\n",
- "# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n",
+    "# NOTE: We need to re-initialize the client here so the Post Training API picks up the updated env var\n",
"client.initialize()"
]
},
@@ -717,12 +988,211 @@
"print(f\"Created job with ID: {job_id}\")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job = wait_customization_job(job_id=job_id, polling_interval=30, timeout=3600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# client.models.register(\n",
+ "# model_id=CUSTOMIZED_MODEL_DIR,\n",
+ "# model_type=ModelType.llm,\n",
+ "# provider_id=\"nvidia\",\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check that the customized model has been picked up by NIM;\n",
+ "# We allow up to 5 minutes for the LoRA adapter to be loaded\n",
+ "wait_nim_loads_customized_model(model_id=customized_model_dir, namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check that inference with the new customized model works\n",
+ "from llama_stack.apis.models.models import ModelType\n",
+ "\n",
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# client.models.register(\n",
+ "# model_id=customized_model_dir,\n",
+ "# model_type=ModelType.llm,\n",
+ "# provider_id=\"nvidia\",\n",
+ "# )\n",
+ "\n",
+ "# TODO: This won't work until the code above works - errors with model_id not found.\n",
+ "# response = client.inference.completion(\n",
+ "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+ "# stream=False,\n",
+ "# model_id=f\"default/{customized_model_dir}\",\n",
+ "# sampling_params={\n",
+ "# \"max_tokens\": 50,\n",
+ "# },\n",
+ "# )\n",
+ "\n",
+ "# TODO: Remove this once code above works. Until then, we'll directly call NIM.\n",
+ "response = requests.post(\n",
+ " url=f\"{NIM_URL}/v1/chat/completions\",\n",
+ " json={\n",
+ " \"model\": f\"{namespace}/{customized_model_dir}\",\n",
+ " \"messages\": sample_messages,\n",
+ " \"max_tokens\": 20,\n",
+ " \"temperature\": 0.7,\n",
+ " \"top_p\": 0.9,\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to get adapted model completion {response.text}\"\n",
+ "response.json()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert len(response.json()[\"choices\"][0][\"message\"][\"content\"]) > 1"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Evaluate Customized Model with chat dataset\n",
- "Implement this section after Evalutor integration is done."
+ "## Evaluate Customized Model with chat dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Launch evaluation for customized model\n",
+ "\n",
+ "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
+ "# response = client.eval.run_eval(\n",
+ "# benchmark_id=benchmark_id,\n",
+ "# benchmark_config={\n",
+ "# \"eval_candidate\": {\n",
+ "# \"type\": \"model\",\n",
+ "# \"model\": {\n",
+ "# \"api_endpoint\": {\n",
+ "# \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n",
+ "# \"model_id\": f\"{namespace}/{customized_model_dir}\",\n",
+ "# }\n",
+ "# },\n",
+ "# }\n",
+ "# }\n",
+ "# )\n",
+ "# job_id = response.model_dump()[\"job_id\"]\n",
+ "# print(f\"Created evaluation job {job_id}\")\n",
+ "\n",
+ "# TODO: Remove this once code above works. Until then, we'll directly call the Eval API.\n",
+ "response = requests.post(\n",
+ " f\"{NEMO_URL}/v1/evaluation/jobs\",\n",
+ " json={\n",
+ " \"config\": f\"nvidia/{benchmark_id}\",\n",
+ " \"target\": {\n",
+ " \"type\": \"model\",\n",
+ " \"model\": {\n",
+ " \"api_endpoint\": {\n",
+ " \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n",
+ " \"model_id\": f\"{namespace}/{customized_model_dir}\",\n",
+ " }\n",
+ " },\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create new evaluation target {response.text}\"\n",
+ "response.json()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_id = response.json()[\"id\"]\n",
+ "print(f\"Created evaluation job {job_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
+ "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract bleu score and assert it's within range\n",
+ "customized_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
+ "print(f\"Customized bleu score: {customized_bleu_score}\")\n",
+ "\n",
+ "assert customized_bleu_score >= 40"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract accuracy and assert it's within range\n",
+ "customized_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n",
+ "print(f\"Customized accuracy: {customized_accuracy_score}\")\n",
+ "\n",
+ "assert customized_accuracy_score >= 0.47"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Ensure the customized model evaluation is better than the original model evaluation\n",
+ "print(f\"customized_bleu_score - initial_bleu_score: {customized_bleu_score - initial_bleu_score}\")\n",
+ "assert (customized_bleu_score - initial_bleu_score) >= 20\n",
+ "\n",
+ "print(f\"customized_accuracy_score - initial_accuracy_score: {customized_accuracy_score - initial_accuracy_score}\")\n",
+ "assert (customized_accuracy_score - initial_accuracy_score) >= 0.2"
]
},
{
@@ -757,8 +1227,7 @@
"outputs": [],
"source": [
"# Check inference with guardrails\n",
- "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
- "message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n",
+    "message = {\"role\": \"user\", \"content\": \"You are stupid.\"}\n",
"response = client.safety.run_shield(\n",
" messages=[message],\n",
" shield_id=shield_id,\n",
@@ -769,16 +1238,14 @@
")\n",
"\n",
"print(f\"Safety response: {response}\")\n",
- "# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n",
- "# assert response.user_message == \"Sorry I cannot do this.\""
+ "assert response.user_message == \"Sorry I cannot do this.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## TODO: Guardrails Evaluation\n",
- "TODO: Implement this section after Evalutor integration is done."
+ "## Guardrails Evaluation\n"
]
}
],
@@ -798,7 +1265,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.10"
+ "version": "3.10.2"
}
},
"nbformat": 4,
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 0e5959c37..f9bb9a171 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -27,8 +27,8 @@ class ModelCandidate(BaseModel):
"""
type: Literal["model"] = "model"
- model: str
- sampling_params: SamplingParams
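+    # NOTE: `model` accepts either a registered model ID (str) or a provider-specific
+    # target descriptor (dict), e.g. a NeMo Evaluator `api_endpoint` block.
+    # `sampling_params` is optional and defaults to `SamplingParams()` when omitted.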
+ model: Union[str, Dict[str, Any]]
+ sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
system_message: Optional[SystemMessage] = None
diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py
index 2ef46251e..ca8464f51 100644
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@@ -48,13 +48,13 @@ class NVIDIAEvalImpl(
async def _evaluator_get(self, path):
"""Helper for making GET requests to the evaluator service."""
- response = requests.get(url=f"{self.config.evaluator_service_url}/{path}")
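+        # NOTE: `path` is expected to include a leading slash, e.g. "/v1/evaluation/jobs".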
+ response = requests.get(url=f"{self.config.evaluator_service_url}{path}")
response.raise_for_status()
return response.json()
async def _evaluator_post(self, path, data):
"""Helper for making POST requests to the evaluator service."""
- response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data)
+ response = requests.post(url=f"{self.config.evaluator_service_url}{path}", json=data)
response.raise_for_status()
return response.json()
diff --git a/llama_stack/providers/remote/post_training/nvidia/post_training.py b/llama_stack/providers/remote/post_training/nvidia/post_training.py
index b3653b527..7594d5554 100644
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@@ -408,7 +408,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
if v is not None
}
else:
- raise NotImplementedError(f"JASH was here Unsupported algorithm config: {algorithm_config}")
+ raise NotImplementedError(f"Unsupported algorithm config: {algorithm_config}")
# Create the customization job
response = await self._make_request(