{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook contains the Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"- Please reference <TODO: Add docs link> to set up the NVIDIA platform."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# NVIDIA URLs\n",
"NDS_URL = \"\"\n",
"NEMO_URL = \"\"\n",
"NIM_URL = \"\"\n",
"\n",
"# Inference env vars\n",
"os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n",
"\n",
"USER_ID = \"llama-stack-user\"\n",
"NAMESPACE = \"default\"\n",
"PROJECT_ID = \"test-project\"\n",
"CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n",
"\n",
"# Customizer env vars\n",
"os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n",
"os.environ[\"NVIDIA_USER_ID\"] = USER_ID\n",
"os.environ[\"NVIDIA_DATASET_NAMESPACE\"] = NAMESPACE\n",
"os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n",
"\n",
"# Guardrails env vars\n",
"os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import json\n",
"import os\n",
"import pprint\n",
"from time import sleep, time\n",
"from typing import Dict\n",
"\n",
"import aiohttp\n",
"import requests\n",
"from huggingface_hub import HfApi\n",
"\n",
"os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n",
"os.environ[\"HF_TOKEN\"] = \"token\"\n",
"\n",
"hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set Up Llama Stack Client\n",
"Begin by importing the necessary components from Llama Stack's client library:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning: `bwrap` is not available. Code interpreter tool will not work correctly.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing NVIDIASafetyAdapter(https://nmp.int.aire.nvidia.com)...\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Using config <span style=\"color: #000080; text-decoration-color: #000080\">nvidia</span>:\n",
"</pre>\n"
],
"text/plain": [
"Using config \u001b[34mnvidia\u001b[0m:\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">apis:\n",
"- agents\n",
"- datasetio\n",
"- eval\n",
"- inference\n",
"- post_training\n",
"- safety\n",
"- scoring\n",
"- telemetry\n",
"- tool_runtime\n",
"- vector_io\n",
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
"container_image: null\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: nvidia\n",
"logging: null\n",
"metadata_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
"models:\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama3-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-FP8\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-1b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3b-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90b-vision-instruct\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8192</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2048</span>\n",
" model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-nv-embedqa-1b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: nvidia/nv-embedqa-e5-v5\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-e5-v5\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4096</span>\n",
" model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
"- metadata:\n",
" context_length: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">512</span>\n",
" embedding_dimension: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1024</span>\n",
" model_id: snowflake/arctic-embed-l\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: snowflake/arctic-embed-l\n",
"providers:\n",
" agents:\n",
" - config:\n",
" persistence_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">agents_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" datasetio:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">localfs_datasetio.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: localfs\n",
" provider_type: inline::localfs\n",
" eval:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">meta_reference_eval.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" inference:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nim.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" post_training:\n",
" - config:\n",
" api_key: <span style=\"color: #008000; text-decoration-color: #008000\">'********'</span>\n",
" customizer_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" dataset_namespace: default\n",
" project_id: test-project\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" safety:\n",
" - config:\n",
" config_id: self-check\n",
" guardrails_service_url: <span style=\"color: #0000ff; text-decoration-color: #0000ff; text-decoration: underline\">https://nmp.int.aire.nvidia.com</span>\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" scoring:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: basic\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::ba</span>sic\n",
" telemetry:\n",
" - config:\n",
" service_name: <span style=\"color: #008000; text-decoration-color: #008000\">\"\\u200B\"</span>\n",
" sinks: sqlite\n",
" sqlite_db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">trace_store.db</span>\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" tool_runtime:\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: rag-runtime\n",
" provider_type: inline::rag-runtime\n",
" vector_io:\n",
" - config:\n",
" kvstore:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/jgulabrai/.llama/distributions/nvidia/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">faiss_store.db</span>\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: faiss\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::fa</span>iss\n",
"scoring_fns: <span style=\"font-weight: bold\">[]</span>\n",
"server:\n",
" auth: null\n",
" port: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8321</span>\n",
" tls_certfile: null\n",
" tls_keyfile: null\n",
"shields: <span style=\"font-weight: bold\">[]</span>\n",
"tool_groups:\n",
"- args: null\n",
" mcp_endpoint: null\n",
" provider_id: rag-runtime\n",
" toolgroup_id: builtin::rag\n",
"vector_dbs: <span style=\"font-weight: bold\">[]</span>\n",
"version: <span style=\"color: #008000; text-decoration-color: #008000\">'2'</span>\n",
"\n",
"</pre>\n"
],
"text/plain": [
"apis:\n",
"- agents\n",
"- datasetio\n",
"- eval\n",
"- inference\n",
"- post_training\n",
"- safety\n",
"- scoring\n",
"- telemetry\n",
"- tool_runtime\n",
"- vector_io\n",
"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"container_image: null\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: nvidia\n",
"logging: null\n",
"metadata_store:\n",
" db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",
" namespace: null\n",
" type: sqlite\n",
"models:\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama3-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3\u001b[0m-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-8b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama3-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3\u001b[0m-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama3-70b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-FP8\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-1B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n",
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - llm\n",
" provider_id: nvidia\n",
" provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n",
"- metadata:\n",
" context_length: \u001b[1;36m8192\u001b[0m\n",
" embedding_dimension: \u001b[1;36m2048\u001b[0m\n",
" model_id: nvidia/llama-\u001b[1;36m3.2\u001b[0m-nv-embedqa-1b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/llama-\u001b[1;36m3.2\u001b[0m-nv-embedqa-1b-v2\n",
"- metadata:\n",
" context_length: \u001b[1;36m512\u001b[0m\n",
" embedding_dimension: \u001b[1;36m1024\u001b[0m\n",
" model_id: nvidia/nv-embedqa-e5-v5\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-e5-v5\n",
"- metadata:\n",
" context_length: \u001b[1;36m512\u001b[0m\n",
" embedding_dimension: \u001b[1;36m4096\u001b[0m\n",
" model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
"- metadata:\n",
" context_length: \u001b[1;36m512\u001b[0m\n",
" embedding_dimension: \u001b[1;36m1024\u001b[0m\n",
" model_id: snowflake/arctic-embed-l\n",
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
" - embedding\n",
" provider_id: nvidia\n",
" provider_model_id: snowflake/arctic-embed-l\n",
"providers:\n",
" agents:\n",
" - config:\n",
" persistence_store:\n",
" db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95magents_store.db\u001b[0m\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" datasetio:\n",
" - config:\n",
" kvstore:\n",
" db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mlocalfs_datasetio.db\u001b[0m\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: localfs\n",
" provider_type: inline::localfs\n",
" eval:\n",
" - config:\n",
" kvstore:\n",
" db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mmeta_reference_eval.db\u001b[0m\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" inference:\n",
" - config:\n",
" api_key: \u001b[32m'********'\u001b[0m\n",
" url: \u001b[4;94mhttps://nim.int.aire.nvidia.com\u001b[0m\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" post_training:\n",
" - config:\n",
" api_key: \u001b[32m'********'\u001b[0m\n",
" customizer_url: \u001b[4;94mhttps://nmp.int.aire.nvidia.com\u001b[0m\n",
" dataset_namespace: default\n",
" project_id: test-project\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" safety:\n",
" - config:\n",
" config_id: self-check\n",
" guardrails_service_url: \u001b[4;94mhttps://nmp.int.aire.nvidia.com\u001b[0m\n",
" provider_id: nvidia\n",
" provider_type: remote::nvidia\n",
" scoring:\n",
" - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" provider_id: basic\n",
" provider_type: inlin\u001b[1;92me::ba\u001b[0msic\n",
" telemetry:\n",
" - config:\n",
" service_name: \u001b[32m\"\\u200B\"\u001b[0m\n",
" sinks: sqlite\n",
" sqlite_db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mtrace_store.db\u001b[0m\n",
" provider_id: meta-reference\n",
" provider_type: inline::meta-reference\n",
" tool_runtime:\n",
" - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" provider_id: rag-runtime\n",
" provider_type: inline::rag-runtime\n",
" vector_io:\n",
" - config:\n",
" kvstore:\n",
" db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mfaiss_store.db\u001b[0m\n",
" namespace: null\n",
" type: sqlite\n",
" provider_id: faiss\n",
" provider_type: inlin\u001b[1;92me::fa\u001b[0miss\n",
"scoring_fns: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"server:\n",
" auth: null\n",
" port: \u001b[1;36m8321\u001b[0m\n",
" tls_certfile: null\n",
" tls_keyfile: null\n",
"shields: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"tool_groups:\n",
"- args: null\n",
" mcp_endpoint: null\n",
" provider_id: rag-runtime\n",
" toolgroup_id: builtin::rag\n",
"vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"version: \u001b[32m'2'\u001b[0m\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"\n",
"client = LlamaStackAsLibraryClient(\"nvidia\")\n",
"client.initialize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Upload Dataset Using the HuggingFace Client"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_squad_test_dataset_name = \"jg-llama-stack\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\""
]
},
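{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, illustrative sketch of the local prompt/completion dataset layout that\n",
"# the cells below expect under ./tmp/sample_squad_data. The \"prompt\"/\"completion\"\n",
"# field names follow the keys referenced elsewhere in this notebook; the record\n",
"# content is made up, and files are only written if no real data is present.\n",
"sample_record = {\n",
"    \"prompt\": \"Context: Paris is the capital of France. Question: What is the capital of France? Answer:\",\n",
"    \"completion\": \" Paris\",\n",
"}\n",
"for split in (\"training\", \"validation\", \"testing\"):\n",
"    split_dir = f\"./tmp/sample_squad_data/{split}\"\n",
"    os.makedirs(split_dir, exist_ok=True)\n",
"    split_file = f\"{split_dir}/{split}.jsonl\"\n",
"    if not os.path.exists(split_file):\n",
"        with open(split_file, \"w\") as f:\n",
"            f.write(json.dumps(sample_record) + \"\\n\")"
]
},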
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
"# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/training\",\n",
"# path_in_repo=\"training\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/validation\",\n",
"# path_in_repo=\"validation\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_data/testing\",\n",
"# path_in_repo=\"testing\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset\n",
"# response = client.datasets.register(...)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the files URL\n",
"# response = client.datasets.retrieve(repo_id)\n",
"# dataset = response.model_dump()\n",
"# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pprint\n",
"\n",
"with open(\"./tmp/sample_squad_data/testing/testing.jsonl\", \"r\") as f:\n",
" examples = [json.loads(line) for line in f]\n",
"\n",
"# Get the user prompt from the last example\n",
"sample_prompt = examples[-1][\"prompt\"]\n",
"pprint.pprint(sample_prompt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test inference\n",
"response = client.inference.chat_completion(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": sample_prompt}\n",
" ],\n",
" model_id=\"meta/llama-3.1-8b-instruct\",\n",
" sampling_params={\n",
" \"max_tokens\": 20,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" }\n",
" }\n",
")\n",
"print(f\"Inference response: {response.completion_message.content}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation\n",
"TODO: Implement this section after Evaluator integration is done."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Customization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Start the customization job\n",
"response = client.post_training.supervised_fine_tune(\n",
" job_uuid=\"\",\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" training_config={\n",
" \"n_epochs\": 2,\n",
" \"data_config\": {\n",
" \"batch_size\": 16,\n",
" \"dataset_id\": sample_squad_test_dataset_name,\n",
" },\n",
" \"optimizer_config\": {\n",
" \"lr\": 0.0001,\n",
" }\n",
" },\n",
" algorithm_config={\n",
" \"type\": \"LoRA\",\n",
" \"adapter_dim\": 16,\n",
" \"adapter_dropout\": 0.1,\n",
" \"alpha\": 16,\n",
" # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n",
" \"rank\": 8,\n",
" \"lora_attn_modules\": [],\n",
" \"apply_lora_to_mlp\": True,\n",
" \"apply_lora_to_output\": False\n",
" },\n",
" hyperparam_search_config={},\n",
" logger_config={},\n",
" checkpoint_dir=\"\",\n",
")\n",
"\n",
"job_id = response.job_uuid\n",
"print(f\"Created job with ID: {job_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Customized model isn't available in the list of models, so this check doesn't work.\n",
"# customized_model = client.models.retrieve(f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\")\n",
"# assert customized_model is not None\n",
"job_status = client.post_training.job.status(job_uuid=job_id)\n",
"print(f\"Job status: {job_status.status}\")"
]
},
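{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal polling sketch: wait for the customization job to reach a terminal state\n",
"# before using the customized model. NOTE: the set of in-progress status strings\n",
"# below is an assumption; adjust it to the values your deployment reports.\n",
"while True:\n",
"    job_status = client.post_training.job.status(job_uuid=job_id)\n",
"    print(f\"Job status: {job_status.status}\")\n",
"    if str(job_status.status) not in (\"pending\", \"scheduled\", \"running\", \"in_progress\"):\n",
"        break\n",
"    sleep(30)"
]
},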
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: This doesn't work - errors with model_id not found.\n",
"# Depends on https://github.com/meta-llama/llama-stack/pull/1859\n",
"# Verify that inference with the new model works\n",
"\n",
"from llama_stack.apis.models.models import ModelType\n",
"\n",
"# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n",
"# client.models.register(\n",
"# model_id=CUSTOMIZED_MODEL_DIR,\n",
"# model_type=ModelType.llm,\n",
"# provider_id=\"nvidia\",\n",
"# )\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=f\"default/{CUSTOMIZED_MODEL_DIR}\",\n",
" sampling_params={\n",
" \"max_tokens\": 50,\n",
" },\n",
")"
]
},
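{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assuming the completion call above succeeded, show the generated text; `content`\n",
"# is the text field on Llama Stack's CompletionResponse.\n",
"print(f\"Completion response: {response.content}\")"
]
},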
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model\n",
"Implement this section after Evaluator integration is done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Upload Chat Dataset\n",
"Implement this section after Data Store integration is done.\n",
"Repeat fine-tuning and evaluation with a chat-style dataset, which has a list of `messages` instead of a `prompt` and `completion`."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n",
"namespace = \"default\"\n",
"repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\""
]
},
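{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, illustrative sketch of the chat-style dataset layout expected under\n",
"# ./tmp/sample_squad_messages: each record carries a list of \"messages\" instead of\n",
"# \"prompt\"/\"completion\". The record content is made up, and files are only written\n",
"# if no real data is present.\n",
"sample_chat_record = {\n",
"    \"messages\": [\n",
"        {\"role\": \"user\", \"content\": \"Context: Paris is the capital of France. Question: What is the capital of France?\"},\n",
"        {\"role\": \"assistant\", \"content\": \"Paris\"},\n",
"    ]\n",
"}\n",
"for split in (\"training\", \"validation\", \"testing\"):\n",
"    split_dir = f\"./tmp/sample_squad_messages/{split}\"\n",
"    os.makedirs(split_dir, exist_ok=True)\n",
"    split_file = f\"{split_dir}/{split}.jsonl\"\n",
"    if not os.path.exists(split_file):\n",
"        with open(split_file, \"w\") as f:\n",
"            f.write(json.dumps(sample_chat_record) + \"\\n\")"
]
},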
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the repo\n",
"# hf_api.create_repo(repo_id, repo_type=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upload the files from the local folder\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/training\",\n",
"# path_in_repo=\"training\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/validation\",\n",
"# path_in_repo=\"validation\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )\n",
"# hf_api.upload_folder(\n",
"# folder_path=\"./tmp/sample_squad_messages/testing\",\n",
"# path_in_repo=\"testing\",\n",
"# repo_id=repo_id,\n",
"# repo_type=\"dataset\",\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset\n",
"# response = client.datasets.register(...)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference with chat/completions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"./tmp/sample_squad_messages/testing/testing.jsonl\", \"r\") as f:\n",
" examples = [json.loads(line) for line in f]\n",
"\n",
"# Get the conversation from the last example, dropping the final assistant message\n",
"sample_messages = examples[-1][\"messages\"][:-1]\n",
"pprint.pprint(sample_messages)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test inference\n",
"response = client.inference.chat_completion(\n",
" messages=sample_messages,\n",
" model_id=\"meta/llama-3.1-8b-instruct\",\n",
" sampling_params={\n",
" \"max_tokens\": 20,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" }\n",
" }\n",
")\n",
"assert response.completion_message.content is not None\n",
"print(f\"Inference response: {response.completion_message.content}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate with chat dataset\n",
"TODO: Implement this section after Evaluator integration is done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Customization with chat dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"customized_model_name = \"messages-example-model\"\n",
"customized_model_version = \"v2\"\n",
"customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n",
"os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n",
"\n",
"# TODO: We need to re-initialize the client here to pick up the new env vars\n",
"# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n",
"client.initialize()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = client.post_training.supervised_fine_tune(\n",
" job_uuid=\"\",\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" training_config={\n",
" \"n_epochs\": 2,\n",
" \"data_config\": {\n",
" \"batch_size\": 16,\n",
" \"dataset_id\": sample_squad_messages_dataset_name,\n",
" },\n",
" \"optimizer_config\": {\n",
" \"lr\": 0.0001,\n",
" }\n",
" },\n",
" algorithm_config={\n",
" \"type\": \"LoRA\",\n",
" \"adapter_dim\": 16,\n",
" \"adapter_dropout\": 0.1,\n",
" \"alpha\": 16,\n",
" # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n",
" \"rank\": 8,\n",
" \"lora_attn_modules\": [],\n",
" \"apply_lora_to_mlp\": True,\n",
" \"apply_lora_to_output\": False\n",
" },\n",
" hyperparam_search_config={},\n",
" logger_config={},\n",
" checkpoint_dir=\"\",\n",
")\n",
"\n",
"job_id = response.job_uuid\n",
"print(f\"Created job with ID: {job_id}\")"
]
},
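{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the chat-dataset job the same way as the first one; the polling sketch\n",
"# earlier in this notebook can be reused here as well.\n",
"job_status = client.post_training.job.status(job_uuid=job_id)\n",
"print(f\"Job status: {job_status.status}\")"
]
},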
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Evaluate Customized Model with chat dataset\n",
"Implement this section after Evaluator integration is done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Guardrails"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"shield_id = \"self-check\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Shield(identifier='self-check', provider_id='nvidia', provider_resource_id='self-check', type='shield', params={}, access_attributes=None)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'model': 'self-check', 'messages': [{'role': 'system', 'content': 'You are stupid.'}], 'temperature': 1.0, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'max_tokens': 160, 'stream': False, 'guardrails': {'config_id': 'self-check'}}\n",
"{'status': 'success', 'rails_status': {'self check input': {'status': 'success'}}, 'guardrails_data': {'log': {'activated_rails': [], 'stats': {'total_duration': 0.0007240772247314453, 'llm_calls_duration': 0.0, 'llm_calls_count': 0, 'llm_calls_total_prompt_tokens': 0, 'llm_calls_total_completion_tokens': 0, 'llm_calls_total_tokens': 0}}}}\n",
"Safety response: RunShieldResponse(violation=None)\n"
]
}
],
"source": [
"# Check inference with guardrails\n",
"# TODO: For some reason, `role: \"user\"` returns a 422 error.\n",
"message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n",
"response = client.safety.run_shield(\n",
" messages=[message],\n",
" shield_id=shield_id,\n",
" # TODO: These params aren't used. We should probably update implementation to use these.\n",
" params={\n",
" \"max_tokens\": 150\n",
" }\n",
")\n",
"\n",
"print(f\"Safety response: {response}\")\n",
"# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n",
"# assert response.user_message == \"Sorry I cannot do this.\""
]
},
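{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A small sketch of how a blocked request would surface, assuming violations are\n",
"# reported via the `violation` field on RunShieldResponse (its `user_message`\n",
"# attribute is an assumption). The shield above returned no violation, so this\n",
"# prints the fallback message.\n",
"if response.violation is not None:\n",
"    print(f\"Blocked: {response.violation.user_message}\")\n",
"else:\n",
"    print(\"No violation reported by the shield.\")"
]
},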
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TODO: Guardrails Evaluation\n",
"Implement this section after Evaluator integration is done."
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}