llama-stack-mirror/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook contains Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"- Please reference <TODO: Add docs link> to setup the NVIDIA platform. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n"
]
},
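{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: client-side dependency sketch. The package names below are an assumption, not part of\n",
"# this notebook's original setup, and the NVIDIA distribution may require additional provider\n",
"# dependencies; uncomment and adjust for your environment.\n",
"# %pip install llama-stack llama-stack-client"
]
},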
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# NVIDIA URLs\n",
"NDS_URL = \"https://datastore.int.aire.nvidia.com\"\n",
"NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n",
"NIM_URL = \"https://nim.int.aire.nvidia.com\"\n",
"\n",
"# Inference env vars\n",
"os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
"\n",
"USER_ID = \"llama-stack-user\"\n",
"NAMESPACE = \"default\"\n",
"PROJECT_ID = \"test-project\"\n",
"CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n",
"\n",
"# Customizer env vars\n",
"os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n",
"os.environ[\"NVIDIA_USER_ID\"] = USER_ID\n",
"os.environ[\"NVIDIA_DATASET_NAMESPACE\"] = NAMESPACE\n",
"os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n",
"# Guardrails env vars\n",
"os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n",
"\n",
"# Evaluator env vars\n",
"os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import json\n",
"import os\n",
"import pprint\n",
"from time import sleep, time\n",
"from typing import Dict\n",
"\n",
"# import aiohttp\n",
"# import requests\n",
"# from huggingface_hub import HfApi\n",
"\n",
"# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
"# os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n",
"# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set Up Llama Stack Client\n",
"Begin by importing the necessary components from Llama Stack's client library:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"\n",
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
"client.initialize()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Helper functions for waiting on jobs\n",
"from llama_stack.apis.common.job_types import JobStatus\n",
"\n",
"def wait_customization_job(job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
"\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" response = client.post_training.job.status(job_uuid=job_id)\n",
" job_status = response.status\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Customization Job {job_id} took more than {timeout} seconds.\")\n",
" \n",
" return job_status\n",
"\n",
"def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
" start_time = time()\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n",
" sleep(polling_interval)\n",
" job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n",
"\n",
" print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n",
"\n",
" if time() - start_time > timeout:\n",
" raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n",
"\n",
" return job_status\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Upload Dataset Using the HuggingFace Client"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
"# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/training\",\n",
"# path_in_repo=\"training\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/validation\",\n",
"# path_in_repo=\"validation\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/testing\",\n",
"# path_in_repo=\"testing\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset\n",
"# response = client.datasets.register(...)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the files URL\n",
"# response = client.datasets.retrieve(repo_id)\n",
"# dataset = response.model_dump()\n",
"# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pprint\n",
"\n",
"with open(\"./tmp/sample_squad_data/testing/testing.jsonl\", \"r\") as f:\n",
" examples = [json.loads(line) for line in f]\n",
"\n",
"# Get the user prompt from the last example\n",
"sample_prompt = examples[-1][\"prompt\"]\n",
"pprint.pprint(sample_prompt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test inference\n",
"response = client.inference.chat_completion(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": sample_prompt}\n",
" ],\n",
" model_id=\"meta/llama-3.1-8b-instruct\",\n",
" sampling_params={\n",
" \"max_tokens\": 20,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" }\n",
" }\n",
")\n",
"print(f\"Inference response: {response.completion_message.content}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation\n",
"TODO: Implement this section after Evalutor integration is done."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"benchmark_id = \"jg-llama-stack-3\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Register a benchmark, which creates an Evaluation Config\n",
"simple_eval_config = {\n",
" \"benchmark_id\": benchmark_id,\n",
" \"dataset_id\": \"\",\n",
" \"scoring_functions\": [],\n",
" \"metadata\": {\n",
" \"type\": \"custom\",\n",
" \"params\": {\n",
" \"parallelism\": 8\n",
" },\n",
" \"tasks\": {\n",
" \"qa\": {\n",
" \"type\": \"completion\",\n",
" \"params\": {\n",
" \"template\": {\n",
" \"prompt\": \"{{prompt}}\",\n",
" \"max_tokens\": 200\n",
" }\n",
" },\n",
" \"dataset\": {\n",
" \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n",
" },\n",
" \"metrics\": {\n",
" \"bleu\": {\n",
" \"type\": \"bleu\",\n",
" \"params\": {\n",
" \"references\": [\n",
" \"{{ideal_response}}\"\n",
" ]\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"response = client.benchmarks.register(\n",
" benchmark_id=benchmark_id,\n",
" dataset_id=repo_id,\n",
" scoring_functions=simple_eval_config[\"scoring_functions\"],\n",
" metadata=simple_eval_config[\"metadata\"]\n",
")\n",
"print(f\"Created benchmark {benchmark_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for benchmark in client.benchmarks.list():\n",
" print(benchmark)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Launch a simple evaluation with the benchmark\n",
"response = client.eval.run_eval(\n",
" benchmark_id=benchmark_id,\n",
" benchmark_config={\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
" \"model\": \"meta/llama-3.1-8b-instruct\",\n",
" \"sampling_params\": {\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 1.0,\n",
" \"top_p\": 0.95,\n",
" },\n",
" \"max_tokens\": 4096,\n",
" \"repeat_penalty\": 1.0,\n",
" },\n",
" }\n",
" }\n",
")\n",
"job_id = response.model_dump()[\"job_id\"]\n",
"print(f\"Created evaluation job {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n",
"print(f\"Job results: {job_results.model_dump()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract bleu score and assert it's within range\n",
"initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n",
"print(f\"Initial bleu score: {initial_bleu_score}\")\n",
"\n",
"assert initial_bleu_score >= 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract accuracy and assert it's within range\n",
"initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n",
"print(f\"Initial accuracy: {initial_accuracy_score}\")\n",
"\n",
"assert initial_accuracy_score >= 0.5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Customization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Start the customization job\n",
"response = client.post_training.supervised_fine_tune(\n",
" job_uuid=\"\",\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" training_config={\n",
" \"n_epochs\": 2,\n",
" \"data_config\": {\n",
" \"batch_size\": 16,\n",
" \"dataset_id\": sample_squad_test_dataset_name,\n",
" },\n",
" \"optimizer_config\": {\n",
" \"lr\": 0.0001,\n",
" }\n",
" },\n",
" algorithm_config={\n",
" \"type\": \"LoRA\",\n",
" \"adapter_dim\": 16,\n",
" \"adapter_dropout\": 0.1,\n",
" \"alpha\": 16,\n",
" # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n",
" \"rank\": 8,\n",
" \"lora_attn_modules\": [],\n",
" \"apply_lora_to_mlp\": True,\n",
" \"apply_lora_to_output\": False\n",
" },\n",
" hyperparam_search_config={},\n",
" logger_config={},\n",
" checkpoint_dir=\"\",\n",
")\n",
"\n",
"job_id = response.job_uuid\n",
"print(f\"Created job with ID: {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the job to complete\n",
"job_status = wait_customization_job(job_id=job_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Job {job_id} status: {job_status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify that inference with the new model works\n",
"from llama_stack.apis.models.models import ModelType\n",
"\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
"# client.models.register(\n",
"# model_id=CUSTOMIZED_MODEL_DIR,\n",
"# model_type=ModelType.llm,\n",
"# provider_id=\"nvidia\",\n",
"# )\n",
"\n",
"# TODO: This won't work until the code above works - errors with model_id not found.\n",
"# response = client.inference.completion(\n",
"# content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"# stream=False,\n",
"# model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
"# sampling_params={\n",
"# \"max_tokens\": 50,\n",
"# },\n",
"# )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model\n",
"Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Upload Chat Dataset\n",
"Implement this section after Data Store integration is done.\n",
"Repeat fine-tuning and evaluation with a chat style dataset, which has a list of `messages` instead of a `prompt` and `completion`."
]
},
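{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative record shapes only (hypothetical values, not rows from the actual dataset files).\n",
"# A completion-style record pairs a `prompt` with its target `completion`, while a chat-style\n",
"# record carries a list of role-tagged `messages`.\n",
"completion_style_record = {\n",
"    \"prompt\": \"Answer the question: What is the capital of France?\",\n",
"    \"completion\": \"Paris\",\n",
"}\n",
"\n",
"chat_style_record = {\n",
"    \"messages\": [\n",
"        {\"role\": \"user\", \"content\": \"Answer the question: What is the capital of France?\"},\n",
"        {\"role\": \"assistant\", \"content\": \"Paris\"},\n",
"    ]\n",
"}\n",
"\n",
"pprint.pprint(completion_style_record)\n",
"pprint.pprint(chat_style_record)"
]
},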
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
"# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/training\",\n",
"# path_in_repo=\"training\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/validation\",\n",
"# path_in_repo=\"validation\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/testing\",\n",
"# path_in_repo=\"testing\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset\n",
"# response = client.datasets.register(...)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference with chat/completions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"./tmp/sample_squad_messages/testing/testing.jsonl\", \"r\") as f:\n",
" examples = [json.loads(line) for line in f]\n",
"\n",
"# get the user and assistant messages from the last example\n",
"sample_messages = examples[-1][\"messages\"][:-1]\n",
"pprint.pprint(sample_messages)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test inference\n",
"response = client.inference.chat_completion(\n",
" messages=sample_messages,\n",
" model_id=\"meta/llama-3.1-8b-instruct\",\n",
" sampling_params={\n",
" \"max_tokens\": 20,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" }\n",
" }\n",
")\n",
"assert response.completion_message.content is not None\n",
"print(f\"Inference response: {response.completion_message.content}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate with chat dataset\n",
"TODO: Implement this section after Evalutor integration is done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Customization with chat dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"customized_model_name = \"messages-example-model\"\n",
"customized_model_version = \"v2\"\n",
"customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n",
"\n",
"# TODO: We need to re-initialize the client here to pick up the new env vars\n",
"# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n",
"client.initialize()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = client.post_training.supervised_fine_tune(\n",
" job_uuid=\"\",\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" training_config={\n",
" \"n_epochs\": 2,\n",
" \"data_config\": {\n",
" \"batch_size\": 16,\n",
" \"dataset_id\": sample_squad_messages_dataset_name,\n",
" },\n",
" \"optimizer_config\": {\n",
" \"lr\": 0.0001,\n",
" }\n",
" },\n",
" algorithm_config={\n",
" \"type\": \"LoRA\",\n",
" \"adapter_dim\": 16,\n",
" \"adapter_dropout\": 0.1,\n",
" \"alpha\": 16,\n",
" # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n",
" \"rank\": 8,\n",
" \"lora_attn_modules\": [],\n",
" \"apply_lora_to_mlp\": True,\n",
" \"apply_lora_to_output\": False\n",
" },\n",
" hyperparam_search_config={},\n",
" logger_config={},\n",
" checkpoint_dir=\"\",\n",
")\n",
"\n",
"job_id = response.job_uuid\n",
"print(f\"Created job with ID: {job_id}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model with chat dataset\n",
"Implement this section after Evalutor integration is done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Guardrails"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"shield_id = \"self-check\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check inference with guardrails\n",
"# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
"message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n",
"response = client.safety.run_shield(\n",
" messages=[message],\n",
" shield_id=shield_id,\n",
" # TODO: These params aren't used. We should probably update implementation to use these.\n",
" params={\n",
" \"max_tokens\": 150\n",
" }\n",
")\n",
"\n",
"print(f\"Safety response: {response}\")\n",
"# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n",
"# assert response.user_message == \"Sorry I cannot do this.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Guardrails Evaluation\n",
"TODO: Implement this section after Evalutor integration is done."
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}