{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook contains a Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prerequisites\n", "- Please refer to the NVIDIA setup documentation to set up the NVIDIA platform before running this notebook. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# NVIDIA URLs\n", "NDS_URL = \"\"\n", "NEMO_URL = \"\"\n", "NIM_URL = \"\"\n", "\n", "# Inference env vars\n", "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n", "\n", "USER_ID = \"llama-stack-user\"\n", "NAMESPACE = \"default\"\n", "PROJECT_ID = \"test-project\"\n", "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n", "\n", "# Customizer env vars\n", "os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n", "os.environ[\"NVIDIA_USER_ID\"] = USER_ID\n", "os.environ[\"NVIDIA_DATASET_NAMESPACE\"] = NAMESPACE\n", "os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n", "\n", "# Guardrails env vars\n", "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import asyncio\n", "import json\n", "import os\n", "import pprint\n", "from time import sleep, time\n", "from typing import Dict\n", "\n", "import aiohttp\n", "import requests\n", "from huggingface_hub import HfApi\n", "\n", "os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n", "os.environ[\"HF_TOKEN\"] = \"token\"\n", "\n", "hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set Up Llama Stack Client\n", "Begin by importing the necessary components from Llama Stack's client library:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": 
{}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Warning: `bwrap` is not available. Code interpreter tool will not work correctly.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Initializing NVIDIASafetyAdapter(https://nmp.int.aire.nvidia.com)...\n" ] }, { "data": { "text/html": [ "
Using config nvidia:\n",
       "
\n" ], "text/plain": [ "Using config \u001b[34mnvidia\u001b[0m:\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
apis:\n",
       "- agents\n",
       "- datasetio\n",
       "- eval\n",
       "- inference\n",
       "- post_training\n",
       "- safety\n",
       "- scoring\n",
       "- telemetry\n",
       "- tool_runtime\n",
       "- vector_io\n",
       "benchmarks: []\n",
       "container_image: null\n",
       "datasets: []\n",
       "image_name: nvidia\n",
       "logging: null\n",
       "metadata_store:\n",
       "  db_path: /Users/jgulabrai/.llama/distributions/nvidia/registry.db\n",
       "  namespace: null\n",
       "  type: sqlite\n",
       "models:\n",
       "- metadata: {}\n",
       "  model_id: meta/llama3-8b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama3-8b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3-8B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama3-8b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama3-70b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama3-70b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3-70B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama3-70b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.1-8b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-8b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.1-8B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-8b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.1-70b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-70b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.1-70B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-70b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.1-405b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-405b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.1-405b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.2-1b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-1b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.2-1B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-1b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.2-3b-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-3b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.2-3B-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-3b-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.2-11b-vision-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-11b-vision-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-11b-vision-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta/llama-3.2-90b-vision-instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-90b-vision-instruct\n",
       "- metadata: {}\n",
       "  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - llm\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: meta/llama-3.2-90b-vision-instruct\n",
       "- metadata:\n",
       "    context_length: 8192\n",
       "    embedding_dimension: 2048\n",
       "  model_id: nvidia/llama-3.2-nv-embedqa-1b-v2\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - embedding\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2\n",
       "- metadata:\n",
       "    context_length: 512\n",
       "    embedding_dimension: 1024\n",
       "  model_id: nvidia/nv-embedqa-e5-v5\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - embedding\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: nvidia/nv-embedqa-e5-v5\n",
       "- metadata:\n",
       "    context_length: 512\n",
       "    embedding_dimension: 4096\n",
       "  model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - embedding\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n",
       "- metadata:\n",
       "    context_length: 512\n",
       "    embedding_dimension: 1024\n",
       "  model_id: snowflake/arctic-embed-l\n",
       "  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
       "  - embedding\n",
       "  provider_id: nvidia\n",
       "  provider_model_id: snowflake/arctic-embed-l\n",
       "providers:\n",
       "  agents:\n",
       "  - config:\n",
       "      persistence_store:\n",
       "        db_path: /Users/jgulabrai/.llama/distributions/nvidia/agents_store.db\n",
       "        namespace: null\n",
       "        type: sqlite\n",
       "    provider_id: meta-reference\n",
       "    provider_type: inline::meta-reference\n",
       "  datasetio:\n",
       "  - config:\n",
       "      kvstore:\n",
       "        db_path: /Users/jgulabrai/.llama/distributions/nvidia/localfs_datasetio.db\n",
       "        namespace: null\n",
       "        type: sqlite\n",
       "    provider_id: localfs\n",
       "    provider_type: inline::localfs\n",
       "  eval:\n",
       "  - config:\n",
       "      kvstore:\n",
       "        db_path: /Users/jgulabrai/.llama/distributions/nvidia/meta_reference_eval.db\n",
       "        namespace: null\n",
       "        type: sqlite\n",
       "    provider_id: meta-reference\n",
       "    provider_type: inline::meta-reference\n",
       "  inference:\n",
       "  - config:\n",
       "      api_key: '********'\n",
       "      url: https://nim.int.aire.nvidia.com\n",
       "    provider_id: nvidia\n",
       "    provider_type: remote::nvidia\n",
       "  post_training:\n",
       "  - config:\n",
       "      api_key: '********'\n",
       "      customizer_url: https://nmp.int.aire.nvidia.com\n",
       "      dataset_namespace: default\n",
       "      project_id: test-project\n",
       "    provider_id: nvidia\n",
       "    provider_type: remote::nvidia\n",
       "  safety:\n",
       "  - config:\n",
       "      config_id: self-check\n",
       "      guardrails_service_url: https://nmp.int.aire.nvidia.com\n",
       "    provider_id: nvidia\n",
       "    provider_type: remote::nvidia\n",
       "  scoring:\n",
       "  - config: {}\n",
       "    provider_id: basic\n",
       "    provider_type: inline::basic\n",
       "  telemetry:\n",
       "  - config:\n",
       "      service_name: \"\\u200B\"\n",
       "      sinks: sqlite\n",
       "      sqlite_db_path: /Users/jgulabrai/.llama/distributions/nvidia/trace_store.db\n",
       "    provider_id: meta-reference\n",
       "    provider_type: inline::meta-reference\n",
       "  tool_runtime:\n",
       "  - config: {}\n",
       "    provider_id: rag-runtime\n",
       "    provider_type: inline::rag-runtime\n",
       "  vector_io:\n",
       "  - config:\n",
       "      kvstore:\n",
       "        db_path: /Users/jgulabrai/.llama/distributions/nvidia/faiss_store.db\n",
       "        namespace: null\n",
       "        type: sqlite\n",
       "    provider_id: faiss\n",
       "    provider_type: inline::faiss\n",
       "scoring_fns: []\n",
       "server:\n",
       "  auth: null\n",
       "  port: 8321\n",
       "  tls_certfile: null\n",
       "  tls_keyfile: null\n",
       "shields: []\n",
       "tool_groups:\n",
       "- args: null\n",
       "  mcp_endpoint: null\n",
       "  provider_id: rag-runtime\n",
       "  toolgroup_id: builtin::rag\n",
       "vector_dbs: []\n",
       "version: '2'\n",
       "\n",
       "
\n" ], "text/plain": [ "apis:\n", "- agents\n", "- datasetio\n", "- eval\n", "- inference\n", "- post_training\n", "- safety\n", "- scoring\n", "- telemetry\n", "- tool_runtime\n", "- vector_io\n", "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: nvidia\n", "logging: null\n", "metadata_store:\n", " db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", " namespace: null\n", " type: sqlite\n", "models:\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama3-8b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama3-8b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3\u001b[0m-8B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama3-8b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama3-70b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama3-70b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3\u001b[0m-70B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama3-70b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n", "- 
metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-8b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-70b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-FP8\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.1\u001b[0m-405b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: 
meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-1B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-1b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-3b-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-11b-vision-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n", "- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " model_id: 
meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - llm\n", " provider_id: nvidia\n", " provider_model_id: meta/llama-\u001b[1;36m3.2\u001b[0m-90b-vision-instruct\n", "- metadata:\n", " context_length: \u001b[1;36m8192\u001b[0m\n", " embedding_dimension: \u001b[1;36m2048\u001b[0m\n", " model_id: nvidia/llama-\u001b[1;36m3.2\u001b[0m-nv-embedqa-1b-v2\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - embedding\n", " provider_id: nvidia\n", " provider_model_id: nvidia/llama-\u001b[1;36m3.2\u001b[0m-nv-embedqa-1b-v2\n", "- metadata:\n", " context_length: \u001b[1;36m512\u001b[0m\n", " embedding_dimension: \u001b[1;36m1024\u001b[0m\n", " model_id: nvidia/nv-embedqa-e5-v5\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - embedding\n", " provider_id: nvidia\n", " provider_model_id: nvidia/nv-embedqa-e5-v5\n", "- metadata:\n", " context_length: \u001b[1;36m512\u001b[0m\n", " embedding_dimension: \u001b[1;36m4096\u001b[0m\n", " model_id: nvidia/nv-embedqa-mistral-7b-v2\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - embedding\n", " provider_id: nvidia\n", " provider_model_id: nvidia/nv-embedqa-mistral-7b-v2\n", "- metadata:\n", " context_length: \u001b[1;36m512\u001b[0m\n", " embedding_dimension: \u001b[1;36m1024\u001b[0m\n", " model_id: snowflake/arctic-embed-l\n", " model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n", " - embedding\n", " provider_id: nvidia\n", " provider_model_id: snowflake/arctic-embed-l\n", "providers:\n", " agents:\n", " - config:\n", " persistence_store:\n", " db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95magents_store.db\u001b[0m\n", " namespace: null\n", " type: sqlite\n", " provider_id: meta-reference\n", " provider_type: inline::meta-reference\n", " datasetio:\n", " - 
config:\n", " kvstore:\n", " db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mlocalfs_datasetio.db\u001b[0m\n", " namespace: null\n", " type: sqlite\n", " provider_id: localfs\n", " provider_type: inline::localfs\n", " eval:\n", " - config:\n", " kvstore:\n", " db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mmeta_reference_eval.db\u001b[0m\n", " namespace: null\n", " type: sqlite\n", " provider_id: meta-reference\n", " provider_type: inline::meta-reference\n", " inference:\n", " - config:\n", " api_key: \u001b[32m'********'\u001b[0m\n", " url: \u001b[4;94mhttps://nim.int.aire.nvidia.com\u001b[0m\n", " provider_id: nvidia\n", " provider_type: remote::nvidia\n", " post_training:\n", " - config:\n", " api_key: \u001b[32m'********'\u001b[0m\n", " customizer_url: \u001b[4;94mhttps://nmp.int.aire.nvidia.com\u001b[0m\n", " dataset_namespace: default\n", " project_id: test-project\n", " provider_id: nvidia\n", " provider_type: remote::nvidia\n", " safety:\n", " - config:\n", " config_id: self-check\n", " guardrails_service_url: \u001b[4;94mhttps://nmp.int.aire.nvidia.com\u001b[0m\n", " provider_id: nvidia\n", " provider_type: remote::nvidia\n", " scoring:\n", " - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " provider_id: basic\n", " provider_type: inlin\u001b[1;92me::ba\u001b[0msic\n", " telemetry:\n", " - config:\n", " service_name: \u001b[32m\"\\u200B\"\u001b[0m\n", " sinks: sqlite\n", " sqlite_db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mtrace_store.db\u001b[0m\n", " provider_id: meta-reference\n", " provider_type: inline::meta-reference\n", " tool_runtime:\n", " - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " provider_id: rag-runtime\n", " provider_type: inline::rag-runtime\n", " vector_io:\n", " - config:\n", " kvstore:\n", " db_path: \u001b[35m/Users/jgulabrai/.llama/distributions/nvidia/\u001b[0m\u001b[95mfaiss_store.db\u001b[0m\n", " namespace: 
null\n", " type: sqlite\n", " provider_id: faiss\n", " provider_type: inlin\u001b[1;92me::fa\u001b[0miss\n", "scoring_fns: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "server:\n", " auth: null\n", " port: \u001b[1;36m8321\u001b[0m\n", " tls_certfile: null\n", " tls_keyfile: null\n", "shields: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "tool_groups:\n", "- args: null\n", " mcp_endpoint: null\n", " provider_id: rag-runtime\n", " toolgroup_id: builtin::rag\n", "vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "version: \u001b[32m'2'\u001b[0m\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n", "\n", "client = LlamaStackAsLibraryClient(\"nvidia\")\n", "client.initialize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO: Upload Dataset Using the HuggingFace Client" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_squad_test_dataset_name = \"jg-llama-stack\"\n", "namespace = \"default\"\n", "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create the repo\n", "# hf_api.create_repo(repo_id, repo_type=\"dataset\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the files from the local folder\n", "# hf_api.upload_folder(\n", "# folder_path=\"./tmp/sample_squad_data/training\",\n", "# path_in_repo=\"training\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )\n", "# hf_api.upload_folder(\n", "# folder_path=\"./tmp/sample_squad_data/validation\",\n", "# path_in_repo=\"validation\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )\n", "# hf_api.upload_folder(\n", "# 
folder_path=\"./tmp/sample_squad_data/testing\",\n", "# path_in_repo=\"testing\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create the dataset\n", "# response = client.datasets.register(...)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check the files URL\n", "# response = client.datasets.retrieve(repo_id)\n", "# dataset = response.model_dump()\n", "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import pprint\n", "\n", "with open(\"./tmp/sample_squad_data/testing/testing.jsonl\", \"r\") as f:\n", " examples = [json.loads(line) for line in f]\n", "\n", "# Get the user prompt from the last example\n", "sample_prompt = examples[-1][\"prompt\"]\n", "pprint.pprint(sample_prompt)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test inference\n", "response = client.inference.chat_completion(\n", " messages=[\n", " {\"role\": \"user\", \"content\": sample_prompt}\n", " ],\n", " model_id=\"meta/llama-3.1-8b-instruct\",\n", " sampling_params={\n", " \"max_tokens\": 20,\n", " \"strategy\": {\n", " \"type\": \"top_p\",\n", " \"temperature\": 0.7,\n", " \"top_p\": 0.9\n", " }\n", " }\n", ")\n", "print(f\"Inference response: {response.completion_message.content}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation\n", "TODO: Implement this section after Evaluator integration is done." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Customization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Start the customization job\n", "response = client.post_training.supervised_fine_tune(\n", " job_uuid=\"\",\n", " model=\"meta-llama/Llama-3.1-8B-Instruct\",\n", " training_config={\n", " \"n_epochs\": 2,\n", " \"data_config\": {\n", " \"batch_size\": 16,\n", " \"dataset_id\": sample_squad_test_dataset_name,\n", " },\n", " \"optimizer_config\": {\n", " \"lr\": 0.0001,\n", " }\n", " },\n", " algorithm_config={\n", " \"type\": \"LoRA\",\n", " \"adapter_dim\": 16,\n", " \"adapter_dropout\": 0.1,\n", " \"alpha\": 16,\n", " # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n", " \"rank\": 8,\n", " \"lora_attn_modules\": [],\n", " \"apply_lora_to_mlp\": True,\n", " \"apply_lora_to_output\": False\n", " },\n", " hyperparam_search_config={},\n", " logger_config={},\n", " checkpoint_dir=\"\",\n", ")\n", "\n", "job_id = response.job_uuid\n", "print(f\"Created job with ID: {job_id}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Customized model isn't available in the list of models, so this check doesn't work.\n", "# customized_model = client.models.retrieve(f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\")\n", "# assert customized_model is not None\n", "job_status = client.post_training.job.status(job_uuid=job_id)\n", "print(f\"Job status: {job_status.status}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# TODO: This doesn't work - errors with model_id not found.\n", "# Depends on https://github.com/meta-llama/llama-stack/pull/1859\n", "# Verify that inference with the new model works\n", "\n", "from llama_stack.apis.models.models import ModelType\n", "\n", "# 
TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", "# client.models.register(\n", "# model_id=CUSTOMIZED_MODEL_DIR,\n", "# model_type=ModelType.llm,\n", "# provider_id=\"nvidia\",\n", "# )\n", "\n", "# Build the model ID from the NAMESPACE constant defined in the setup cell so it\n", "# stays in sync with the configured namespace instead of a hard-coded \"default\".\n", "response = client.inference.completion(\n", " content=\"Complete the sentence using one word: Roses are red, violets are \",\n", " stream=False,\n", " model_id=f\"{NAMESPACE}/{CUSTOMIZED_MODEL_DIR}\",\n", " sampling_params={\n", " \"max_tokens\": 50,\n", " },\n", ")\n", "# Show the completion so the verification result is visible in the notebook output\n", "print(f\"Inference response: {response.content}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO: Evaluate Customized Model\n", "Implement this section after Evaluator integration is done." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO: Upload Chat Dataset\n", "Implement this section after Data Store integration is done.\n", "Repeat fine-tuning and evaluation with a chat style dataset, which has a list of `messages` instead of a `prompt` and `completion`." ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n", "namespace = \"default\"\n", "repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create the repo\n", "# hf_api.create_repo(repo_id, repo_type=\"dataset\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the files from the local folder\n", "# hf_api.upload_folder(\n", "# folder_path=\"./tmp/sample_squad_messages/training\",\n", "# path_in_repo=\"training\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )\n", "# hf_api.upload_folder(\n", "# folder_path=\"./tmp/sample_squad_messages/validation\",\n", "# path_in_repo=\"validation\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )\n", "# hf_api.upload_folder(\n", "# 
folder_path=\"./tmp/sample_squad_messages/testing\",\n", "# path_in_repo=\"testing\",\n", "# repo_id=repo_id,\n", "# repo_type=\"dataset\",\n", "# )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create the dataset\n", "# response = client.datasets.register(...)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference with chat/completions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"./tmp/sample_squad_messages/testing/testing.jsonl\", \"r\") as f:\n", " examples = [json.loads(line) for line in f]\n", "\n", "# get the user and assistant messages from the last example\n", "sample_messages = examples[-1][\"messages\"][:-1]\n", "pprint.pprint(sample_messages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test inference\n", "response = client.inference.chat_completion(\n", " messages=sample_messages,\n", " model_id=\"meta/llama-3.1-8b-instruct\",\n", " sampling_params={\n", " \"max_tokens\": 20,\n", " \"strategy\": {\n", " \"type\": \"top_p\",\n", " \"temperature\": 0.7,\n", " \"top_p\": 0.9\n", " }\n", " }\n", ")\n", "assert response.completion_message.content is not None\n", "print(f\"Inference response: {response.completion_message.content}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate with chat dataset\n", "TODO: Implement this section after Evaluator integration is done." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Customization with chat dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "customized_model_name = \"messages-example-model\"\n", "customized_model_version = \"v2\"\n", "customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n", "\n", "# TODO: We need to re-initialize the client here to pick up the new env vars\n", "# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n", "client.initialize()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "response = client.post_training.supervised_fine_tune(\n", " job_uuid=\"\",\n", " model=\"meta-llama/Llama-3.1-8B-Instruct\",\n", " training_config={\n", " \"n_epochs\": 2,\n", " \"data_config\": {\n", " \"batch_size\": 16,\n", " \"dataset_id\": sample_squad_messages_dataset_name,\n", " },\n", " \"optimizer_config\": {\n", " \"lr\": 0.0001,\n", " }\n", " },\n", " algorithm_config={\n", " \"type\": \"LoRA\",\n", " \"adapter_dim\": 16,\n", " \"adapter_dropout\": 0.1,\n", " \"alpha\": 16,\n", " # NOTE: These fields are required by `AlgorithmConfig` model, but not directly used by NVIDIA\n", " \"rank\": 8,\n", " \"lora_attn_modules\": [],\n", " \"apply_lora_to_mlp\": True,\n", " \"apply_lora_to_output\": False\n", " },\n", " hyperparam_search_config={},\n", " logger_config={},\n", " checkpoint_dir=\"\",\n", ")\n", "\n", "job_id = response.job_uuid\n", "print(f\"Created job with ID: {job_id}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO: Evaluate Customized Model with chat dataset\n", "Implement this section after Evaluator integration is done." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Guardrails" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "shield_id = \"self-check\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Shield(identifier='self-check', provider_id='nvidia', provider_resource_id='self-check', type='shield', params={}, access_attributes=None)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.shields.register(shield_id=shield_id, provider_id=\"nvidia\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'model': 'self-check', 'messages': [{'role': 'system', 'content': 'You are stupid.'}], 'temperature': 1.0, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'max_tokens': 160, 'stream': False, 'guardrails': {'config_id': 'self-check'}}\n", "{'status': 'success', 'rails_status': {'self check input': {'status': 'success'}}, 'guardrails_data': {'log': {'activated_rails': [], 'stats': {'total_duration': 0.0007240772247314453, 'llm_calls_duration': 0.0, 'llm_calls_count': 0, 'llm_calls_total_prompt_tokens': 0, 'llm_calls_total_completion_tokens': 0, 'llm_calls_total_tokens': 0}}}}\n", "Safety response: RunShieldResponse(violation=None)\n" ] } ], "source": [ "# Check inference with guardrails\n", "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n", "message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n", "response = client.safety.run_shield(\n", " messages=[message],\n", " shield_id=shield_id,\n", " # TODO: These params aren't used. 
We should probably update implementation to use these.\n", " params={\n", " \"max_tokens\": 150\n", " }\n", ")\n", "\n", "print(f\"Safety response: {response}\")\n", "# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n", "# assert response.user_message == \"Sorry I cannot do this.\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO: Guardrails Evaluation\n", "TODO: Implement this section after Evaluator integration is done." ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.2" } }, "nbformat": 4, "nbformat_minor": 2 }