diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 567110829..6b5793119 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6347,7 +6347,36 @@ "default": "model" }, "model": { - "type": "string", + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + ], "description": "The model ID to evaluate." }, "sampling_params": { @@ -6362,8 +6391,7 @@ "additionalProperties": false, "required": [ "type", - "model", - "sampling_params" + "model" ], "title": "ModelCandidate", "description": "A model candidate for evaluation." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1dfd17f55..8c5746900 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4468,7 +4468,17 @@ components: const: model default: model model: - type: string + oneOf: + - type: string + - type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object description: The model ID to evaluate. sampling_params: $ref: '#/components/schemas/SamplingParams' @@ -4482,7 +4492,6 @@ required: - type - model - - sampling_params title: ModelCandidate description: A model candidate for evaluation. RegexParserScoringFnParams: diff --git a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb index e9f2d9a9e..0be64073c 100644 --- a/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb +++ b/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook contains Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using NVIDIA." + "This notebook contains a Llama Stack implementation of a common end-to-end workflow for customizing and evaluating LLMs using the NVIDIA provider." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prerequisites\n", - "- Please reference to setup the NVIDIA platform. " + "First, ensure the NeMo Microservices platform is up and running, including the model downloading step for `meta/llama-3.2-1b-instruct`. See installation instructions: https://aire.gitlab-master-pages.nvidia.com/microservices/documentation/latest/nemo-microservices/latest-internal/set-up/deploy-as-platform/index.html (TODO: Update to public docs)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, set up your development environment on your machine. From the root of the project, create and activate a virtual environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "uv sync --extra dev\n", + "uv pip install -e .\n", + "source .venv/bin/activate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build the Llama Stack image using the virtual environment. For local development, set `LLAMA_STACK_DIR` to ensure your local code is used in the image. To use the production version of `llama-stack`, omit `LLAMA_STACK_DIR`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv" ] }, { @@ -22,9 +64,23 @@ "## Setup\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Configure the environment variables for each service.\n", + "\n", + "If needed, update the URLs for each service to point to your deployment.\n", + "- NDS_URL: NeMo Data Store URL\n", + "- NEMO_URL: NeMo Microservices Platform URL\n", + "- NIM_URL: NIM URL\n", + "\n", + "For more information about these variables, see the [NVIDIA Distro documentation](docs/source/distributions/remote_hosted_distro/nvidia.md)." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -35,13 +91,13 @@ "NEMO_URL = \"https://nmp.int.aire.nvidia.com\"\n", "NIM_URL = \"https://nim.int.aire.nvidia.com\"\n", "\n", - "# Inference env vars\n", - "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n", - "\n", "USER_ID = \"llama-stack-user\"\n", "NAMESPACE = \"default\"\n", - "PROJECT_ID = \"test-project\"\n", - "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v1\"\n", + "PROJECT_ID = \"\"\n", + "CUSTOMIZED_MODEL_DIR = \"jg-test-llama-stack@v2\"\n", + "\n", + "# Inference env vars\n", + "os.environ[\"NVIDIA_BASE_URL\"] = NIM_URL\n", "\n", "# Customizer env vars\n", "os.environ[\"NVIDIA_CUSTOMIZER_URL\"] = NEMO_URL\n", @@ -50,16 +106,16 @@ "os.environ[\"NVIDIA_PROJECT_ID\"] = PROJECT_ID\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = CUSTOMIZED_MODEL_DIR\n", "\n", - "# Guardrails env vars\n", - "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n", - "\n", "# Evaluator env vars\n", - "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n" + "os.environ[\"NVIDIA_EVALUATOR_URL\"] = NEMO_URL\n", + "\n", + "# Guardrails env vars\n", + "os.environ[\"GUARDRAILS_SERVICE_URL\"] = NEMO_URL\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -70,14 +126,14 @@ "from time import sleep, time\n", "from typing import Dict\n", "\n", - "# import aiohttp\n", - "# import requests\n", - "# from huggingface_hub import HfApi\n", + "import aiohttp\n", + "import requests\n", + "from huggingface_hub import HfApi\n", "\n", - "# os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n", - "# os.environ[\"HF_TOKEN\"] = \"token\"\n", + "os.environ[\"HF_ENDPOINT\"] = f\"{NDS_URL}/v1/hf\"\n", + "os.environ[\"HF_TOKEN\"] = \"token\"\n", "\n", - "# hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))" + "hf_api = HfApi(endpoint=os.environ.get(\"HF_ENDPOINT\"), token=os.environ.get(\"HF_TOKEN\"))" ] }, { @@ -115,9 +171,10 @@ "    response = client.post_training.job.status(job_uuid=job_id)\n", "    job_status = response.status\n", "\n", + "    print(f\"Waiting for Customization job {job_id} to finish.\")\n", "    print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n", "\n", - "    while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n", + "    while job_status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n", "        sleep(polling_interval)\n", "        response = client.post_training.job.status(job_uuid=job_id)\n", "        job_status = response.status\n", @@ -133,9 +190,10 @@ "    start_time = time()\n", "    job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n", "\n", + "    print(f\"Waiting for 
Evaluation job {job_id} to finish.\")\n", " print(f\"Job status: {job_status} after {time() - start_time} seconds.\")\n", "\n", - " while job_status in [JobStatus.scheduled, JobStatus.in_progress]:\n", + " while job_status.status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:\n", " sleep(polling_interval)\n", " job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)\n", "\n", @@ -144,14 +202,49 @@ " if time() - start_time > timeout:\n", " raise RuntimeError(f\"Evaluation Job {job_id} took more than {timeout} seconds.\")\n", "\n", - " return job_status\n" + " return job_status\n", + "\n", + "def wait_nim_loads_customized_model(model_id: str, namespace: str, polling_interval: int = 10, timeout: int = 300):\n", + " found = False\n", + " start_time = time()\n", + "\n", + " model_path = f\"{namespace}/{model_id}\"\n", + " print(f\"Checking if NIM has loaded customized model {model_path}.\")\n", + "\n", + " while not found:\n", + " sleep(polling_interval)\n", + "\n", + " response = requests.get(f\"{NIM_URL}/v1/models\")\n", + " if model_path in [model[\"id\"] for model in response.json()[\"data\"]]:\n", + " found = True\n", + " print(f\"Model {model_path} available after {time() - start_time} seconds.\")\n", + " break\n", + " else:\n", + " print(f\"Model {model_path} not available after {time() - start_time} seconds.\")\n", + "\n", + " if not found:\n", + " raise RuntimeError(f\"Model {model_path} not available after {timeout} seconds.\")\n", + "\n", + " assert found, f\"Could not find model {model_path} in the list of available models.\"\n", + " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## TODO: Upload Dataset Using the HuggingFace Client" + "## Upload Dataset Using the HuggingFace Client" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "sample_squad_test_dataset_name = \"squad-test-dataset\"\n", + "namespace = \"default\"\n", + "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\"" ] }, { @@ -159,20 +252,9 @@ "execution_count": 6, "metadata": {}, "outputs": [], - "source": [ - "sample_squad_test_dataset_name = \"jg-llama-stack-sample-squad-data\"\n", - "namespace = \"default\"\n", - "repo_id = f\"{namespace}/{sample_squad_test_dataset_name}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Create the repo\n", - "# hf_api.create_repo(repo_id, repo_type=\"dataset\")" + "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")" ] }, { @@ -182,24 +264,24 @@ "outputs": [], "source": [ "# Upload the files from the local folder\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_data/training\",\n", - "# path_in_repo=\"training\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_data/validation\",\n", - "# path_in_repo=\"validation\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_data/testing\",\n", - "# path_in_repo=\"testing\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )" + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_data/training\",\n", + " path_in_repo=\"training\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")\n", + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_data/validation\",\n", + " 
path_in_repo=\"validation\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")\n", + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_data/testing\",\n", + " path_in_repo=\"testing\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")" ] }, { @@ -209,7 +291,18 @@ "outputs": [], "source": [ "# Create the dataset\n", - "# response = client.datasets.register(...)" + "# response = client.datasets.register(...)\n", + "response = requests.post(\n", + " url=f\"{NEMO_URL}/v1/datasets\",\n", + " json={\n", + " \"name\": sample_squad_test_dataset_name,\n", + " \"namespace\": namespace,\n", + " \"description\": \"Dataset created from llama-stack e2e notebook\",\n", + " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_test_dataset_name}\",\n", + " },\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n", + "json.dumps(response.json(), indent=2)" ] }, { @@ -221,7 +314,14 @@ "# Check the files URL\n", "# response = client.datasets.retrieve(repo_id)\n", "# dataset = response.model_dump()\n", - "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"" + "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n", + "response = requests.get(\n", + " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_test_dataset_name}\",\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n", + "dataset_obj = response.json()\n", + "print(\"Files URL:\", dataset_obj[\"files_url\"])\n", + "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\"" ] }, { @@ -276,22 +376,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Evaluation\n", - "TODO: Implement this section after Evalutor integration is done." 
+ "## Evaluation\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "benchmark_id = \"jg-llama-stack-3\"" + "benchmark_id = \"test-eval-config-1\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -302,36 +401,41 @@ " \"scoring_functions\": [],\n", " \"metadata\": {\n", " \"type\": \"custom\",\n", - " \"params\": {\n", - " \"parallelism\": 8\n", - " },\n", + " \"params\": {\"parallelism\": 8},\n", " \"tasks\": {\n", " \"qa\": {\n", " \"type\": \"completion\",\n", " \"params\": {\n", " \"template\": {\n", " \"prompt\": \"{{prompt}}\",\n", - " \"max_tokens\": 200\n", - " }\n", - " },\n", - " \"dataset\": {\n", - " \"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"\n", + " \"max_tokens\": 20,\n", + " \"temperature\": 0.7,\n", + " \"top_p\": 0.9,\n", + " },\n", " },\n", + " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n", " \"metrics\": {\n", " \"bleu\": {\n", " \"type\": \"bleu\",\n", - " \"params\": {\n", - " \"references\": [\n", - " \"{{ideal_response}}\"\n", - " ]\n", - " }\n", - " }\n", - " }\n", + " \"params\": {\"references\": [\"{{ideal_response}}\"]},\n", + " },\n", + " \"string-check\": {\n", + " \"type\": \"string-check\",\n", + " \"params\": {\"check\": [\"{{ideal_response | trim}}\", \"equals\", \"{{output_text | trim}}\"]},\n", + " },\n", + " },\n", " }\n", " }\n", " }\n", - "}\n", - "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "response = client.benchmarks.register(\n", " benchmark_id=benchmark_id,\n", " dataset_id=repo_id,\n", @@ -347,32 +451,13 @@ "metadata": {}, "outputs": [], "source": [ - "for benchmark in client.benchmarks.list():\n", - " print(benchmark)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Launch a simple evaluation with the benchmark\n", + "# Launch a simple evaluation with the benchmark\n", "response = client.eval.run_eval(\n", " benchmark_id=benchmark_id,\n", " benchmark_config={\n", " \"eval_candidate\": {\n", " \"type\": \"model\",\n", - " \"model\": \"meta/llama-3.1-8b-instruct\",\n", - " \"sampling_params\": {\n", - " \"strategy\": {\n", - " \"type\": \"top_p\",\n", - " \"temperature\": 1.0,\n", - " \"top_p\": 0.95,\n", - " },\n", - " \"max_tokens\": 4096,\n", - " \"repeat_penalty\": 1.0,\n", - " },\n", + " \"model\": \"meta/llama-3.1-8b-instruct\"\n", " }\n", " }\n", ")\n", @@ -406,7 +491,7 @@ "outputs": [], "source": [ "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n", - "print(f\"Job results: {job_results.model_dump()}\")" + "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")" ] }, { @@ -416,7 +501,7 @@ "outputs": [], "source": [ "# Extract bleu score and assert it's within range\n", - "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"sentence\"][\"value\"]\n", + "initial_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n", "print(f\"Initial bleu score: {initial_bleu_score}\")\n", "\n", "assert initial_bleu_score >= 2" @@ -429,10 +514,10 @@ "outputs": [], "source": [ "# Extract accuracy and assert it's within range\n", - "initial_accuracy_score = 
job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n", + "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n", "print(f\"Initial accuracy: {initial_accuracy_score}\")\n", "\n", - "assert initial_accuracy_score >= 0.5" + "assert initial_accuracy_score >= 0" ] }, { @@ -507,7 +592,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Verify that inference with the new model works\n", + "# Check that inference with the new model works\n", "from llama_stack.apis.models.models import ModelType\n", "\n", "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", @@ -525,7 +610,20 @@ "# sampling_params={\n", "# \"max_tokens\": 50,\n", "# },\n", - "# )" + "# )\n", + "\n", + "res = requests.post(\n", + " url=f\"{NIM_URL}/v1/completions\",\n", + " json={\n", + " \"model\": f\"{namespace}/{CUSTOMIZED_MODEL_DIR}\",\n", + " \"prompt\": sample_prompt,\n", + " \"max_tokens\": 20,\n", + " \"temperature\": 0.7,\n", + " \"top_p\": 0.9,\n", + " },\n", + ")\n", + "assert res.status_code in (200, 201), f\"Status Code {res.status_code} Failed to get adapted model completion {res.text}\"\n", + "json.dumps(res.json(), indent=2)" ] }, { @@ -533,37 +631,37 @@ "metadata": {}, "source": [ "## TODO: Evaluate Customized Model\n", - "Implement this section after Evalutor integration is done, and we can register Customized model in Model Registry." + "Implement this section after we can register Customized model in Model Registry." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## TODO: Upload Chat Dataset\n", - "Implement this section after Data Store integration is done.\n", + "## Upload Chat Dataset\n", "Repeat fine-tuning and evaluation with a chat style dataset, which has a list of `messages` instead of a `prompt` and `completion`." 
] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "sample_squad_messages_dataset_name = \"jg-llama-stack-sample-squad-messages\"\n", + "sample_squad_messages_dataset_name = \"test-squad-messages-dataset\"\n", "namespace = \"default\"\n", "repo_id = f\"{namespace}/{sample_squad_messages_dataset_name}\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Create the repo\n", - "# hf_api.create_repo(repo_id, repo_type=\"dataset\")" + "# hf_api.create_repo(repo_id, repo_type=\"dataset\")\n", + "res = hf_api.create_repo(repo_id, repo_type=\"dataset\")" ] }, { @@ -573,24 +671,24 @@ "outputs": [], "source": [ "# Upload the files from the local folder\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_messages/training\",\n", - "# path_in_repo=\"training\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_messages/validation\",\n", - "# path_in_repo=\"validation\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )\n", - "# hf_api.upload_folder(\n", - "# folder_path=\"./tmp/sample_squad_messages/testing\",\n", - "# path_in_repo=\"testing\",\n", - "# repo_id=repo_id,\n", - "# repo_type=\"dataset\",\n", - "# )" + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_messages/training\",\n", + " path_in_repo=\"training\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")\n", + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_messages/validation\",\n", + " path_in_repo=\"validation\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")\n", + "hf_api.upload_folder(\n", + " folder_path=\"./tmp/sample_squad_messages/testing\",\n", + " path_in_repo=\"testing\",\n", + " repo_id=repo_id,\n", + " repo_type=\"dataset\",\n", + ")" ] }, { @@ -600,7 +698,38 @@ "outputs": [], "source": [ "# Create the dataset\n", - "# response = client.datasets.register(...)" + "# response = client.datasets.register(...)\n", + "response = requests.post(\n", + " url=f\"{NEMO_URL}/v1/datasets\",\n", + " json={\n", + " \"name\": sample_squad_messages_dataset_name,\n", + " \"namespace\": namespace,\n", + " \"description\": \"Dataset created from llama-stack e2e notebook\",\n", + " \"files_url\": f\"hf://datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n", + " \"project\": \"default/project-7tLfD8Lt59wFbarFceF3xN\",\n", + " },\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create dataset {response.text}\"\n", + "json.dumps(response.json(), indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the files URL\n", + "# response = client.datasets.retrieve(repo_id)\n", + "# dataset = response.model_dump()\n", + "# assert dataset[\"source\"][\"uri\"] == f\"hf://datasets/{repo_id}\"\n", + "response = requests.get(\n", + " url=f\"{NEMO_URL}/v1/datasets/{namespace}/{sample_squad_messages_dataset_name}\",\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to fetch dataset {response.text}\"\n", + "dataset_obj = response.json()\n", + "print(\"Files URL:\", dataset_obj[\"files_url\"])\n", + "assert dataset_obj[\"files_url\"] == f\"hf://datasets/{repo_id}\"" ] }, { @@ -651,8 +780,151 @@ "cell_type": "markdown", 
"metadata": {}, "source": [ - "## Evaluate with chat dataset\n", - "TODO: Implement this section after Evalutor integration is done." + "## Evaluate with chat dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_id = \"test-eval-config-chat-1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# Register a benchmark, which creates an Eval Config\n", + "simple_eval_config = {\n", + " \"benchmark_id\": benchmark_id,\n", + " \"dataset_id\": \"\",\n", + " \"scoring_functions\": [],\n", + " \"metadata\": {\n", + " \"type\": \"custom\",\n", + " \"params\": {\"parallelism\": 8},\n", + " \"tasks\": {\n", + " \"qa\": {\n", + " \"type\": \"completion\",\n", + " \"params\": {\n", + " \"template\": {\n", + " \"messages\": [\n", + " {\"role\": \"{{item.messages[0].role}}\", \"content\": \"{{item.messages[0].content}}\"},\n", + " {\"role\": \"{{item.messages[1].role}}\", \"content\": \"{{item.messages[1].content}}\"},\n", + " ],\n", + " \"max_tokens\": 20,\n", + " \"temperature\": 0.7,\n", + " \"top_p\": 0.9,\n", + " },\n", + " },\n", + " \"dataset\": {\"files_url\": f\"hf://datasets/{repo_id}/testing/testing.jsonl\"},\n", + " \"metrics\": {\n", + " \"bleu\": {\n", + " \"type\": \"bleu\",\n", + " \"params\": {\"references\": [\"{{item.messages[2].content | trim}}\"]},\n", + " },\n", + " \"string-check\": {\n", + " \"type\": \"string-check\",\n", + " \"params\": {\"check\": [\"{{item.messages[2].content}}\", \"equals\", \"{{output_text | trim}}\"]},\n", + " },\n", + " },\n", + " }\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.benchmarks.register(\n", + " benchmark_id=benchmark_id,\n", + " dataset_id=repo_id,\n", + " scoring_functions=simple_eval_config[\"scoring_functions\"],\n", + " metadata=simple_eval_config[\"metadata\"]\n", + ")\n", + "print(f\"Created benchmark {benchmark_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Launch a simple evaluation with the benchmark\n", + "response = client.eval.run_eval(\n", + " benchmark_id=benchmark_id,\n", + " benchmark_config={\n", + " \"eval_candidate\": {\n", + " \"type\": \"model\",\n", + " \"model\": \"meta/llama-3.1-8b-instruct\",\n", + " }\n", + " }\n", + ")\n", + "job_id = response.model_dump()[\"job_id\"]\n", + "print(f\"Created evaluation job {job_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Wait for the job to complete\n", + "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Job {job_id} status: {job.status}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n", + "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract bleu score and assert it's within range\n", + "initial_bleu_score = 
job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n", + "print(f\"Initial bleu score: {initial_bleu_score}\")\n", + "\n", + "assert initial_bleu_score >= 12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract accuracy and assert it's within range\n", + "initial_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n", + "print(f\"Initial accuracy: {initial_accuracy_score}\")\n", + "\n", + "assert initial_accuracy_score >= 0.2" ] }, { @@ -668,13 +940,12 @@ "metadata": {}, "outputs": [], "source": [ - "customized_model_name = \"messages-example-model\"\n", - "customized_model_version = \"v2\"\n", + "customized_model_name = \"test-messages-model\"\n", + "customized_model_version = \"v1\"\n", "customized_model_dir = f\"{customized_model_name}@{customized_model_version}\"\n", "os.environ[\"NVIDIA_OUTPUT_MODEL_DIR\"] = customized_model_dir\n", "\n", - "# TODO: We need to re-initialize the client here to pick up the new env vars\n", - "# Should the output model dir instead be a parameter to `supervised_fine_tune`?\n", + "# NOTE: We need to re-initialize the client here so the Post Training API pick up the updated env var\n", "client.initialize()" ] }, @@ -717,12 +988,211 @@ "print(f\"Created job with ID: {job_id}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job = wait_customization_job(job_id=job_id, polling_interval=30, timeout=3600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", + "# client.models.register(\n", + "# model_id=CUSTOMIZED_MODEL_DIR,\n", + "# model_type=ModelType.llm,\n", + "# provider_id=\"nvidia\",\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check that the customized model has been picked up by NIM;\n", + "# We allow up to 5 minutes for the LoRA adapter to be loaded\n", + "wait_nim_loads_customized_model(model_id=customized_model_dir, namespace=namespace)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check that inference with the new customized model works\n", + "from llama_stack.apis.models.models import ModelType\n", + "\n", + "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", + "# client.models.register(\n", + "# model_id=customized_model_dir,\n", + "# model_type=ModelType.llm,\n", + "# provider_id=\"nvidia\",\n", + "# )\n", + "\n", + "# TODO: This won't work until the code above works - errors with model_id not found.\n", + "# response = client.inference.completion(\n", + "# content=\"Complete the sentence using one word: Roses are red, violets are \",\n", + "# stream=False,\n", + "# model_id=f\"default/{customized_model_dir}\",\n", + "# sampling_params={\n", + "# \"max_tokens\": 50,\n", + "# },\n", + "# )\n", + "\n", + "# TODO: Remove this once code above works. 
Until then, we'll directly call NIM.\n", + "response = requests.post(\n", + " url=f\"{NIM_URL}/v1/chat/completions\",\n", + " json={\n", + " \"model\": f\"{namespace}/{customized_model_dir}\",\n", + " \"messages\": sample_messages,\n", + " \"max_tokens\": 20,\n", + " \"temperature\": 0.7,\n", + " \"top_p\": 0.9,\n", + " },\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to get adapted model completion {response.text}\"\n", + "response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(response.json()[\"choices\"][0][\"message\"][\"content\"]) > 1" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## TODO: Evaluate Customized Model with chat dataset\n", - "Implement this section after Evalutor integration is done." + "## Evaluate Customized Model with chat dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Launch evaluation for customized model\n", + "\n", + "# TODO: Uncomment after https://github.com/meta-llama/llama-stack/pull/1859 is merged\n", + "# response = client.eval.run_eval(\n", + "# benchmark_id=benchmark_id,\n", + "# benchmark_config={\n", + "# \"eval_candidate\": {\n", + "# \"type\": \"model\",\n", + "# \"model\": \"meta/llama-3.1-8b-instruct\",\n", + "# \"model\": {\n", + "# \"api_endpoint\": {\n", + "# \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n", + "# \"model_id\": f\"{namespace}/{customized_model_dir}\",\n", + "# }\n", + "# },\n", + "# }\n", + "# }\n", + "# )\n", + "# job_id = response.model_dump()[\"job_id\"]\n", + "# print(f\"Created evaluation job {job_id}\")\n", + "\n", + "# TODO: Remove this once code above works. 
Until then, we'll directly call the Eval API.\n", + "response = requests.post(\n", + " f\"{NEMO_URL}/v1/evaluation/jobs\",\n", + " json={\n", + " \"config\": f\"nvidia/{benchmark_id}\",\n", + " \"target\": {\n", + " \"type\": \"model\",\n", + " \"model\": {\n", + " \"api_endpoint\": {\n", + " \"url\": \"http://nemo-nim-proxy:8000/v1/chat/completions\",\n", + " \"model_id\": f\"{namespace}/{customized_model_dir}\",\n", + " }\n", + " },\n", + " },\n", + " },\n", + ")\n", + "assert response.status_code in (200, 201), f\"Status Code {response.status_code} Failed to create new evaluation target {response.text}\"\n", + "response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job_id = response.json()[\"id\"]\n", + "print(f\"Created evaluation job {job_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job = wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)\n", + "print(f\"Job results: {json.dumps(job_results.model_dump(), indent=2)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract bleu score and assert it's within range\n", + "customized_bleu_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"bleu\"][\"scores\"][\"corpus\"][\"value\"]\n", + "print(f\"Customized bleu score: {customized_bleu_score}\")\n", + "\n", + "assert customized_bleu_score >= 40" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract accuracy and assert it's within range\n", + "customized_accuracy_score = job_results.scores[benchmark_id].aggregated_results[\"tasks\"][\"qa\"][\"metrics\"][\"string-check\"][\"scores\"][\"string-check\"][\"value\"]\n", + "print(f\"Customized accuracy: {customized_accuracy_score}\")\n", + "\n", + "assert customized_accuracy_score >= 0.47" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure the customized model evaluation is better than the original model evaluation\n", + "print(f\"customized_bleu_score - initial_bleu_score: {customized_bleu_score - initial_bleu_score}\")\n", + "assert (customized_bleu_score - initial_bleu_score) >= 20\n", + "\n", + "print(f\"customized_accuracy_score - initial_accuracy_score: {customized_accuracy_score - initial_accuracy_score}\")\n", + "assert (customized_accuracy_score - initial_accuracy_score) >= 0.2" ] }, { @@ -757,8 +1227,7 @@ "outputs": [], "source": [ "# Check inference with guardrails\n", - "# TODO: For some reason, `role: \"user\"` returns a 422 error.\n", - "message = {\"role\": \"system\", \"content\": \"You are stupid.\"}\n", + "message = {\"role\": \"role\", \"content\": \"You are stupid.\"}\n", "response = client.safety.run_shield(\n", " messages=[message],\n", " shield_id=shield_id,\n", @@ -769,16 +1238,14 @@ ")\n", "\n", "print(f\"Safety response: {response}\")\n", - "# TODO: We expect Guardrails status to be \"blocked\", but it's actually \"success\"\n", - "# assert response.user_message == \"Sorry I cannot do this.\"" + "assert response.user_message == \"Sorry I cannot do this.\"" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ - "## TODO: Guardrails Evaluation\n", - "TODO: Implement this section after Evalutor integration is done." + "## Guardrails Evaluation\n" ] } ], @@ -798,7 +1265,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.10.2" } }, "nbformat": 4, diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 0e5959c37..f9bb9a171 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -27,8 +27,8 @@ class ModelCandidate(BaseModel): """ type: Literal["model"] = "model" - model: str - sampling_params: SamplingParams + model: Union[str, Dict[str, Any]] + sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams) system_message: Optional[SystemMessage] = None diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py index 2ef46251e..ca8464f51 100644 --- a/llama_stack/providers/remote/eval/nvidia/eval.py +++ b/llama_stack/providers/remote/eval/nvidia/eval.py @@ -48,13 +48,13 @@ class NVIDIAEvalImpl( async def _evaluator_get(self, path): """Helper for making GET requests to the evaluator service.""" - response = requests.get(url=f"{self.config.evaluator_service_url}/{path}") + response = requests.get(url=f"{self.config.evaluator_service_url}{path}") response.raise_for_status() return response.json() async def _evaluator_post(self, path, data): """Helper for making POST requests to the evaluator service.""" - response = requests.post(url=f"{self.config.evaluator_service_url}/{path}", json=data) + response = requests.post(url=f"{self.config.evaluator_service_url}{path}", json=data) response.raise_for_status() return response.json() diff --git a/llama_stack/providers/remote/post_training/nvidia/post_training.py b/llama_stack/providers/remote/post_training/nvidia/post_training.py index b3653b527..7594d5554 100644 --- a/llama_stack/providers/remote/post_training/nvidia/post_training.py +++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py @@ -408,7 +408,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper): if v is not None } else: - raise NotImplementedError(f"JASH was here Unsupported algorithm config: {algorithm_config}") + raise NotImplementedError(f"Unsupported algorithm config: {algorithm_config}") # Create the customization job response = await self._make_request(