diff --git a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb new file mode 100644 index 000000000..b504b66ef --- /dev/null +++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb @@ -0,0 +1,1350 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Llama Stack RAG Lifecycle\n", + "\n", + "In this notebook, we will walk through the lifecycle of building and evaluating a RAG pipeline using Llama Stack. \n", + "\n", + "**Example: Torchtune Knowledge Agent** \n", + "\n", + "Throughout this notebook, we will build a knowledge agent that can answer questions about the Torchtune project. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not in Google Colab environment\n" + ] + } + ], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n", + "from llama_stack_client.types.agent_create_params import AgentConfig\n", + "from llama_stack_client.lib.agents.agent import Agent\n", + "from rich.pretty import pprint\n", + "import json\n", + "import uuid\n", + "from pydantic import BaseModel\n", + "import rich\n", + "import os\n", + "try:\n", + " from google.colab import userdata\n", + " os.environ['FIREWORKS_API_KEY'] = userdata.get('FIREWORKS_API_KEY')\n", + "except ImportError:\n", + " print(\"Not in Google Colab environment\")\n", + "\n", + "# client = LlamaStackAsLibraryClient(\"fireworks\", provider_data = {\"fireworks_api_key\": os.environ['FIREWORKS_API_KEY']})\n", + "# _ = client.initialize()\n", + "\n", + "# Uncomment to run on a hosted Llama Stack server\n", + "client = LlamaStackClient(base_url=\"http://localhost:8321\")\n", + "\n", + "MODEL_ID = \"meta-llama/Llama-3.3-70B-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Simple Vanilla Agent\n", + "\n", + "First, we will build a simple vanilla agent without any access to external knowledge base or tools, and check how it performs on a couple of questions. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# First, let's come up with a couple of examples to test the agent\n", + "examples = [\n", + " {\n", + " \"input_query\": \"What precision formats does torchtune support?\",\n", + " \"expected_answer\": \"Torchtune supports two data types for precision: fp32 (full-precision) which uses 4 bytes per model and optimizer parameter, and bfloat16 (half-precision) which uses 2 bytes per model and optimizer parameter.\"\n", + " },\n", + " {\n", + " \"input_query\": \"What does DoRA stand for in torchtune?\",\n", + " \"expected_answer\": \"Weight-Decomposed Low-Rank Adaptation\"\n", + " },\n", + " {\n", + " \"input_query\": \"How does the CPUOffloadOptimizer reduce GPU memory usage?\",\n", + " \"expected_answer\": \"The CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. 
It can also optionally offload gradients to CPU by using offload_gradients=True\"\n", + " },\n", + " {\n", + " \"input_query\": \"How do I ensure only LoRA parameters are trainable when fine-tuning?\",\n", + " \"expected_answer\": \"You can set only LoRA parameters to trainable using torchtune's utility functions: first fetch all LoRA parameters with lora_params = get_adapter_params(lora_model), then set them as trainable with set_trainable_params(lora_model, lora_params). The LoRA recipe handles this automatically.\"\n", + " }\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "simple_agent = Agent(client, model=MODEL_ID, instructions=\"You are a helpful assistant that can answer questions about the Torchtune project.\")\n", + "simple_session_id = simple_agent.create_session(session_name=f\"simple_session_{uuid.uuid4()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
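Since the same ask-and-print loop is repeated for each agent in this notebook, here is a small helper sketch that factors it out. It only reuses objects defined above (`rich`, the agent, the session id, and `examples`); the helper itself is not part of the Llama Stack API.

```python
# Helper sketch (not a Llama Stack API): run every example through an
# agent session, print the answers, and return them for later scoring.
def run_examples(agent, session_id, examples):
    answers = []
    for example in examples:
        response = agent.create_turn(
            messages=[{"role": "user", "content": example["input_query"]}],
            session_id=session_id,
            stream=False,
        )
        rich.print(f"[bold cyan]Question:[/bold cyan] {example['input_query']}")
        rich.print(f"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}")
        answers.append(response.output_message.content)
    return answers
```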
+        "Question: What precision formats does torchtune support?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: Torchtune supports the following precision formats:\n", + "\n", + "* `fp32` (32-bit floating point)\n", + "* `fp16` (16-bit floating point)\n", + "* `bf16` (Brain Floating Point 16, a 16-bit floating point format)\n", + "* `int8` (8-bit integer)\n", + "\n", + "These precision formats can be used for model weights, activations, and gradients, allowing for flexible \n", + "mixed-precision training and inference.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats:\n", + "\n", + "* `fp32` \u001b[1m(\u001b[0m\u001b[1;36m32\u001b[0m-bit floating point\u001b[1m)\u001b[0m\n", + "* `fp16` \u001b[1m(\u001b[0m\u001b[1;36m16\u001b[0m-bit floating point\u001b[1m)\u001b[0m\n", + "* `bf16` \u001b[1m(\u001b[0mBrain Floating Point \u001b[1;36m16\u001b[0m, a \u001b[1;36m16\u001b[0m-bit floating point format\u001b[1m)\u001b[0m\n", + "* `int8` \u001b[1m(\u001b[0m\u001b[1;36m8\u001b[0m-bit integer\u001b[1m)\u001b[0m\n", + "\n", + "These precision formats can be used for model weights, activations, and gradients, allowing for flexible \n", + "mixed-precision training and inference.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: What does DoRA stand for in torchtune?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: In Torchtune, DoRA stands for \"Dynamic Output Range Awareness\"\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m In Torchtune, DoRA stands for \u001b[32m\"Dynamic Output Range Awareness\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: The CPUOffloadOptimizer in Torchtune reduces GPU memory usage by offloading the model's parameters \n",
+ "and gradients to the CPU during the backward pass, freeing up GPU memory. This allows for training larger models \n",
+ "that would otherwise not fit in GPU memory.\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer in Torchtune reduces GPU memory usage by offloading the model's parameters \n",
+ "and gradients to the CPU during the backward pass, freeing up GPU memory. This allows for training larger models \n",
+ "that would otherwise not fit in GPU memory.\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: To ensure only LoRA (Low-Rank Adaptation) parameters are trainable when fine-tuning, you can set the \n", + "`requires_grad` attribute of the original model parameters to `False` and the `requires_grad` attribute of the LoRA\n", + "parameters to `True`. This will freeze the original model weights and only update the LoRA parameters during \n", + "fine-tuning.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m parameters are trainable when fine-tuning, you can set the \n", + "`requires_grad` attribute of the original model parameters to `\u001b[3;91mFalse\u001b[0m` and the `requires_grad` attribute of the LoRA\n", + "parameters to `\u001b[3;92mTrue\u001b[0m`. This will freeze the original model weights and only update the LoRA parameters during \n", + "fine-tuning.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for example in examples:\n", + " response = simple_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=simple_session_id,\n", + " stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.1 Evaluate Agent Responses\n", + "Let's gather up the agent's logs and evaluate the agent's performance. We can see that our agent's response is quite bad and off from the expected answer." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ScoringScoreResponse(\n", + "│ results={\n", + "│ │ 'braintrust::factuality': ScoringResult(\n", + "│ │ │ aggregated_results={'average': {'average': 0.0}},\n", + "│ │ │ score_rows=[\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': \"1. The expert answer states that Torchtune supports two precision formats: fp32 (full-precision) and bfloat16 (half-precision).\\n2. The submitted answer lists three precision formats: Full precision (FP32), Mixed precision (FP16), and Integer precision (INT8).\\n3. The submitted answer mentions FP32, which aligns with the expert answer's mention of fp32.\\n4. The submitted answer does not mention bfloat16, which is included in the expert answer.\\n5. The submitted answer includes FP16 and INT8, which are not mentioned in the expert answer.\\n6. Since the submitted answer includes precision formats not mentioned by the expert and omits bfloat16, there is a disagreement between the two answers regarding the precision formats supported by Torchtune.\\n\\nBased on this analysis, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Deep Optimization and Rapid Auto-tuning.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. Therefore, the submitted answer conflicts with the expert answer.'\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': \"1. **Expert Answer**: The expert states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU.\\n\\n2. **Submitted Answer**: The submission describes the CPUOffloadOptimizer as offloading certain model parameters and intermediate results to the CPU, which is different from the expert's focus on optimizer states and steps. The submission does not mention optimizer states or steps but instead focuses on parameters and intermediate results.\\n\\n3. **Comparison**:\\n - The expert's answer focuses on optimizer states and steps being offloaded to the CPU.\\n - The submission focuses on model parameters and intermediate results being offloaded to the CPU.\\n - The submission does not mention the offloading of gradients or optimizer states and steps.\\n\\n4. **Conclusion**: There is a disagreement between the submitted answer and the expert answer. The submission describes a different mechanism (offloading model parameters and intermediate results) than the expert (offloading optimizer states and steps). 
Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': \"1. **Expert Answer Analysis**: The expert answer suggests using torchtune's utility functions to set only LoRA parameters as trainable. It involves fetching LoRA parameters using `get_adapter_params(lora_model)` and then setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also mentions that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer suggests using the `trainable_params` argument in Torchtune's `lora` module to ensure only LoRA parameters are trainable. It provides a code example where `trainable_params` is set to `'lora'` when creating the LoRA adapter, which freezes the original model weights and only trains the LoRA parameters.\\n\\n3. **Comparison**:\\n - Both answers aim to achieve the same goal: making only LoRA parameters trainable.\\n - The expert answer provides a method using utility functions `get_adapter_params` and `set_trainable_params`.\\n - The submitted answer provides a method using the `trainable_params` argument directly in the `lora` module.\\n - The methods described in both answers are different, indicating a potential disagreement in the approach.\\n\\n4. **Conclusion**: The submitted answer and the expert answer describe different methods to achieve the same goal, which suggests a disagreement in the approach. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ }\n", + "│ │ │ ]\n", + "│ │ )\n", + "│ }\n", + ")\n", + "\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. The expert answer states that Torchtune supports two precision formats: fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and bfloat16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n2. The submitted answer lists three precision formats: Full precision \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFP32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Mixed precision \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFP16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and Integer precision \u001b[0m\u001b[32m(\u001b[0m\u001b[32mINT8\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n3. 
The submitted answer mentions FP32, which aligns with the expert answer's mention of fp32.\\n4. The submitted answer does not mention bfloat16, which is included in the expert answer.\\n5. The submitted answer includes FP16 and INT8, which are not mentioned in the expert answer.\\n6. Since the submitted answer includes precision formats not mentioned by the expert and omits bfloat16, there is a disagreement between the two answers regarding the precision formats supported by Torchtune.\\n\\nBased on this analysis, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Deep Optimization and Rapid Auto-tuning.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. Therefore, the submitted answer conflicts with the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer**: The expert states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU.\\n\\n2. **Submitted Answer**: The submission describes the CPUOffloadOptimizer as offloading certain model parameters and intermediate results to the CPU, which is different from the expert's focus on optimizer states and steps. The submission does not mention optimizer states or steps but instead focuses on parameters and intermediate results.\\n\\n3. **Comparison**:\\n - The expert's answer focuses on optimizer states and steps being offloaded to the CPU.\\n - The submission focuses on model parameters and intermediate results being offloaded to the CPU.\\n - The submission does not mention the offloading of gradients or optimizer states and steps.\\n\\n4. **Conclusion**: There is a disagreement between the submitted answer and the expert answer. 
The submission describes a different mechanism \u001b[0m\u001b[32m(\u001b[0m\u001b[32moffloading model parameters and intermediate results\u001b[0m\u001b[32m)\u001b[0m\u001b[32m than the expert \u001b[0m\u001b[32m(\u001b[0m\u001b[32moffloading optimizer states and steps\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Analysis**: The expert answer suggests using torchtune's utility functions to set only LoRA parameters as trainable. It involves fetching LoRA parameters using `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and then setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also mentions that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer suggests using the `trainable_params` argument in Torchtune's `lora` module to ensure only LoRA parameters are trainable. It provides a code example where `trainable_params` is set to `'lora'` when creating the LoRA adapter, which freezes the original model weights and only trains the LoRA parameters.\\n\\n3. **Comparison**:\\n - Both answers aim to achieve the same goal: making only LoRA parameters trainable.\\n - The expert answer provides a method using utility functions `get_adapter_params` and `set_trainable_params`.\\n - The submitted answer provides a method using the `trainable_params` argument directly in the `lora` module.\\n - The methods described in both answers are different, indicating a potential disagreement in the approach.\\n\\n4. **Conclusion**: The submitted answer and the expert answer describe different methods to achieve the same goal, which suggests a disagreement in the approach. 
Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_rows = []\n", + "session_response = client.agents.session.retrieve(agent_id=simple_agent.agent_id, session_id=simple_session_id)\n", + "for i, turn in enumerate(session_response.turns):\n", + " eval_rows.append({\n", + " \"input_query\": examples[i][\"input_query\"],\n", + " \"expected_answer\": examples[i][\"expected_answer\"],\n", + " \"generated_answer\": turn.output_message.content,\n", + " })\n", + "\n", + "scoring_params = {\n", + " \"braintrust::factuality\": None,\n", + "}\n", + "scoring_response = client.scoring.score(\n", + " input_rows=eval_rows,\n", + " scoring_functions=scoring_params,\n", + ")\n", + "pprint(scoring_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Search Agent\n", + "\n", + "Now, let's see how we can improve the agent's performance by adding a search tool." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "search_agent = Agent(client, \n", + " model=MODEL_ID,\n", + " instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. You should always use the search tool to answer questions.\",\n", + " tools=[\"builtin::websearch\"])\n", + "search_session_id = search_agent.create_session(session_name=f\"search_session_{uuid.uuid4()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
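Before reading the search agent's answers, it can be worth confirming that the websearch tool was actually invoked on each turn. A sketch along these lines should work after the loop below has run; the `steps`/`step_type` attribute names are assumptions based on the agents API's turn objects, so verify them against your client version.

```python
# Sketch (attribute names assumed): count tool-execution steps per turn to
# verify that the agent really called the websearch tool.
session = client.agents.session.retrieve(
    agent_id=search_agent.agent_id, session_id=search_session_id
)
for turn in session.turns:
    tool_steps = [s for s in turn.steps if s.step_type == "tool_execution"]
    print(f"{len(tool_steps)} tool call(s) for: {turn.output_message.content[:60]!r}")
```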
+        "Question: What precision formats does torchtune support?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: Torchtune supports the following precision formats:\n", + "\n", + "* bf16 (16-bit floating-point format)\n", + "* fp32 (32-bit floating-point format, also known as \"full-precision\")\n", + "\n", + "It's worth noting that torchtune also supports mixed-precision techniques, which allow for the use of different \n", + "precision formats for different parts of the model or during different stages of training.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats:\n", + "\n", + "* bf16 \u001b[1m(\u001b[0m\u001b[1;36m16\u001b[0m-bit floating-point format\u001b[1m)\u001b[0m\n", + "* fp32 \u001b[1m(\u001b[0m\u001b[1;36m32\u001b[0m-bit floating-point format, also known as \u001b[32m\"full-precision\"\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "It's worth noting that torchtune also supports mixed-precision techniques, which allow for the use of different \n", + "precision formats for different parts of the model or during different stages of training.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: What does DoRA stand for in torchtune?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: DoRA stands for \"Decoupled Offline Reweighting Adapter\" in torchtune. It is a technique used for \n", + "fine-tuning large language models (LLMs) and is an alternative to LoRA (Low-Rank Adaptation). DoRA is designed to \n", + "be more efficient and effective than LoRA, and it has been shown to achieve high performance on various fine-tuning\n", + "tasks.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for \u001b[32m\"Decoupled Offline Reweighting Adapter\"\u001b[0m in torchtune. It is a technique used for \n", + "fine-tuning large language models \u001b[1m(\u001b[0mLLMs\u001b[1m)\u001b[0m and is an alternative to LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m. DoRA is designed to \n", + "be more efficient and effective than LoRA, and it has been shown to achieve high performance on various fine-tuning\n", + "tasks.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: The CPUOffloadOptimizer reduces GPU memory usage by offloading gradients and optimizer states to the \n", + "CPU, freeing up memory on the GPU. This is done by setting `offload_gradients=True` in the CPUOffloadOptimizer, \n", + "which transfers gradients from the GPU to the CPU once the device-to-host transfer finishes. Additionally, using a \n", + "stateful optimizer with a model with a lot of parameters and not using gradient accumulation can also help minimize\n", + "the slowdown caused by offloading.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading gradients and optimizer states to the \n", + "CPU, freeing up memory on the GPU. This is done by setting `\u001b[33moffload_gradients\u001b[0m=\u001b[3;92mTrue\u001b[0m` in the CPUOffloadOptimizer, \n", + "which transfers gradients from the GPU to the CPU once the device-to-host transfer finishes. Additionally, using a \n", + "stateful optimizer with a model with a lot of parameters and not using gradient accumulation can also help minimize\n", + "the slowdown caused by offloading.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: To ensure only LoRA parameters are trainable when fine-tuning, you can use the `LoRA` class from the \n", + "`transformers` library and set the `trainable` parameter to `True` only for the LoRA parameters. You can also use \n", + "the `freeze` method to freeze the weights of the pre-trained model and only update the LoRA parameters during \n", + "fine-tuning.\n", + "\n", + "Here is an example code snippet:\n", + "```\n", + "from transformers import AutoModelForSequenceClassification, LoRA\n", + "\n", + "# Load pre-trained model and tokenizer\n", + "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "\n", + "# Create LoRA adapter\n", + "lora = LoRA(model, rank=16)\n", + "\n", + "# Freeze pre-trained model weights\n", + "for param in model.parameters():\n", + " param.requires_grad = False\n", + "\n", + "# Set LoRA parameters to trainable\n", + "for param in lora.parameters():\n", + " param.requires_grad = True\n", + "\n", + "# Fine-tune model with LoRA\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = AdamW(lora.parameters(), lr=1e-5)\n", + "\n", + "for epoch in range(5):\n", + " model.train()\n", + " total_loss = 0\n", + " for batch in train_dataloader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + " print(f\"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}\")\n", + "```\n", + "In this example, we first load a pre-trained BERT model and create a LoRA adapter with a rank of 16. We then freeze\n", + "the pre-trained model weights and set the LoRA parameters to trainable. Finally, we fine-tune the model with the \n", + "LoRA adapter using the AdamW optimizer and cross-entropy loss.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `LoRA` class from the \n", + "`transformers` library and set the `trainable` parameter to `\u001b[3;92mTrue\u001b[0m` only for the LoRA parameters. 
You can also use \n", + "the `freeze` method to freeze the weights of the pre-trained model and only update the LoRA parameters during \n", + "fine-tuning.\n", + "\n", + "Here is an example code snippet:\n", + "```\n", + "from transformers import AutoModelForSequenceClassification, LoRA\n", + "\n", + "# Load pre-trained model and tokenizer\n", + "model = \u001b[1;35mAutoModelForSequenceClassification.from_pretrained\u001b[0m\u001b[1m(\u001b[0m\u001b[32m\"bert-base-uncased\"\u001b[0m\u001b[1m)\u001b[0m\n", + "tokenizer = \u001b[1;35mAutoTokenizer.from_pretrained\u001b[0m\u001b[1m(\u001b[0m\u001b[32m\"bert-base-uncased\"\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "# Create LoRA adapter\n", + "lora = \u001b[1;35mLoRA\u001b[0m\u001b[1m(\u001b[0mmodel, \u001b[33mrank\u001b[0m=\u001b[1;36m16\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "# Freeze pre-trained model weights\n", + "for param in \u001b[1;35mmodel.parameters\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m:\n", + " param.requires_grad = \u001b[3;91mFalse\u001b[0m\n", + "\n", + "# Set LoRA parameters to trainable\n", + "for param in \u001b[1;35mlora.parameters\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m:\n", + " param.requires_grad = \u001b[3;92mTrue\u001b[0m\n", + "\n", + "# Fine-tune model with LoRA\n", + "device = \u001b[1;35mtorch.device\u001b[0m\u001b[1m(\u001b[0m\u001b[32m\"cuda\"\u001b[0m if \u001b[1;35mtorch.cuda.is_available\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m else \u001b[32m\"cpu\"\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[1;35mmodel.to\u001b[0m\u001b[1m(\u001b[0mdevice\u001b[1m)\u001b[0m\n", + "criterion = \u001b[1;35mnn.CrossEntropyLoss\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + "optimizer = \u001b[1;35mAdamW\u001b[0m\u001b[1m(\u001b[0m\u001b[1;35mlora.parameters\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33mlr\u001b[0m=\u001b[1;36m1e\u001b[0m\u001b[1;36m-5\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "for epoch in \u001b[1;35mrange\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1m)\u001b[0m:\n", + " \u001b[1;35mmodel.train\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + " total_loss = \u001b[1;36m0\u001b[0m\n", + " for batch in train_dataloader:\n", + " input_ids = batch\u001b[1m[\u001b[0m\u001b[32m\"input_ids\"\u001b[0m\u001b[1m]\u001b[0m\u001b[1;35m.to\u001b[0m\u001b[1m(\u001b[0mdevice\u001b[1m)\u001b[0m\n", + " attention_mask = batch\u001b[1m[\u001b[0m\u001b[32m\"attention_mask\"\u001b[0m\u001b[1m]\u001b[0m\u001b[1;35m.to\u001b[0m\u001b[1m(\u001b[0mdevice\u001b[1m)\u001b[0m\n", + " labels = batch\u001b[1m[\u001b[0m\u001b[32m\"labels\"\u001b[0m\u001b[1m]\u001b[0m\u001b[1;35m.to\u001b[0m\u001b[1m(\u001b[0mdevice\u001b[1m)\u001b[0m\n", + "\n", + " \u001b[1;35moptimizer.zero_grad\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + " outputs = \u001b[1;35mmodel\u001b[0m\u001b[1m(\u001b[0minput_ids, \u001b[33mattention_mask\u001b[0m=\u001b[35mattention_mask\u001b[0m, \u001b[33mlabels\u001b[0m=\u001b[35mlabels\u001b[0m\u001b[1m)\u001b[0m\n", + " loss = \u001b[1;35mcriterion\u001b[0m\u001b[1m(\u001b[0moutputs, labels\u001b[1m)\u001b[0m\n", + "\n", + " \u001b[1;35mloss.backward\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + " \u001b[1;35moptimizer.step\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + " total_loss += \u001b[1;35mloss.item\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + " \u001b[1;35mprint\u001b[0m\u001b[1m(\u001b[0mf\"Epoch \u001b[1m{\u001b[0mepoch+\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m, Loss: \u001b[1m{\u001b[0mtotal_loss 
\u001b[35m/\u001b[0m \u001b[1;35mlen\u001b[0m\u001b[1m(\u001b[0mtrain_dataloader\u001b[1m)\u001b[0m\u001b[1m}\u001b[0m\"\u001b[1m)\u001b[0m\n", + "```\n", + "In this example, we first load a pre-trained BERT model and create a LoRA adapter with a rank of \u001b[1;36m16\u001b[0m. We then freeze\n", + "the pre-trained model weights and set the LoRA parameters to trainable. Finally, we fine-tune the model with the \n", + "LoRA adapter using the AdamW optimizer and cross-entropy loss.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for example in examples:\n", + " response = search_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=search_session_id,\n", + " stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Evaluate Agent Responses\n", + "\n", + "We can see that with a search tool, the agent's performance is much better, and have less hallucinations. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
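The gather-and-score pattern used here is identical to the one in section 1.1, so a helper sketch can factor it out for the remaining agents. It only wraps calls already used in this notebook.

```python
# Helper sketch: collect a finished session's answers into eval rows and
# score them with the braintrust factuality scorer, as done above.
def evaluate_session(agent, session_id, examples):
    session = client.agents.session.retrieve(
        agent_id=agent.agent_id, session_id=session_id
    )
    eval_rows = [
        {
            "input_query": examples[i]["input_query"],
            "expected_answer": examples[i]["expected_answer"],
            "generated_answer": turn.output_message.content,
        }
        for i, turn in enumerate(session.turns)
    ]
    return client.scoring.score(
        input_rows=eval_rows,
        scoring_functions={"braintrust::factuality": None},
    )
```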
ScoringScoreResponse(\n", + "│ results={\n", + "│ │ 'braintrust::factuality': ScoringResult(\n", + "│ │ │ aggregated_results={'average': {'average': 0.44999999999999996}},\n", + "│ │ │ score_rows=[\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': '1. **Expert Answer Analysis**: The expert answer states that Torchtune supports two precision formats: fp32 (full-precision) and bfloat16 (half-precision).\\n - fp32 is described as using 4 bytes per model and optimizer parameter.\\n - bfloat16 is described as using 2 bytes per model and optimizer parameter.\\n\\n2. **Submitted Answer Analysis**: The submitted answer lists the following precision formats:\\n - bf16 (16-bit floating-point format)\\n - fp32 (32-bit floating-point format, also known as \"full-precision\")\\n - Additionally, it mentions support for mixed-precision techniques.\\n\\n3. **Comparison**:\\n - Both answers mention fp32 and bf16/bfloat16, which are essentially the same formats, though the naming differs slightly (bf16 vs bfloat16).\\n - The submitted answer includes additional information about mixed-precision techniques, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**:\\n - The submitted answer includes all the information from the expert answer and adds more details about mixed-precision techniques.\\n - There is no conflict between the two answers; the submitted answer is more detailed.\\n\\nTherefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Offline Reweighting Adapter.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for.\\n5. Therefore, the correct choice is that there is a disagreement between the submitted answer and the expert answer.'\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': \"1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU by setting `offload_gradients=True`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer mentions offloading both gradients and optimizer states to the CPU, which aligns with the expert's mention of keeping optimizer states on the CPU and optionally offloading gradients. It also adds details about freeing up GPU memory, the process of transferring gradients once the device-to-host transfer finishes, and additional strategies like using a stateful optimizer and not using gradient accumulation to minimize slowdown.\\n\\n3. 
**Comparison**: The submitted answer includes all the points mentioned in the expert answer:\\n - Offloading optimizer states to the CPU.\\n - Optionally offloading gradients to the CPU with `offload_gradients=True`.\\n - It adds more details about the process and additional strategies to minimize slowdown, which are not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and adds more details. Therefore, the correct choice is (B).\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': \"1. **Expert Answer**: The expert answer suggests using torchtune's utility functions to set only LoRA parameters as trainable. It involves fetching LoRA parameters using `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also mentions that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer**: The submitted answer provides a method using the `transformers` library. It involves creating a LoRA adapter, freezing the pre-trained model weights, and setting LoRA parameters to trainable. It provides a detailed code example for this process.\\n\\n3. **Comparison**:\\n - The expert answer focuses on using torchtune's utility functions, while the submitted answer uses the `transformers` library.\\n - Both answers aim to achieve the same goal: making only LoRA parameters trainable.\\n - The submitted answer provides a more detailed explanation and code example, which is not present in the expert answer.\\n - There is no direct conflict in the factual content, as both methods are valid ways to achieve the same result.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer because it includes additional details and a code example, while still being consistent with the expert's goal of making only LoRA parameters trainable. Therefore, the correct choice is (B).\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ }\n", + "│ │ │ ]\n", + "│ │ )\n", + "│ }\n", + ")\n", + "\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.44999999999999996\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. 
**Expert Answer Analysis**: The expert answer states that Torchtune supports two precision formats: fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and bfloat16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n - fp32 is described as using 4 bytes per model and optimizer parameter.\\n - bfloat16 is described as using 2 bytes per model and optimizer parameter.\\n\\n2. **Submitted Answer Analysis**: The submitted answer lists the following precision formats:\\n - bf16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32m16-bit floating-point format\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n - fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32m32-bit floating-point format, also known as \"full-precision\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n - Additionally, it mentions support for mixed-precision techniques.\\n\\n3. **Comparison**:\\n - Both answers mention fp32 and bf16/bfloat16, which are essentially the same formats, though the naming differs slightly \u001b[0m\u001b[32m(\u001b[0m\u001b[32mbf16 vs bfloat16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n - The submitted answer includes additional information about mixed-precision techniques, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**:\\n - The submitted answer includes all the information from the expert answer and adds more details about mixed-precision techniques.\\n - There is no conflict between the two answers; the submitted answer is more detailed.\\n\\nTherefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Offline Reweighting Adapter.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for.\\n5. Therefore, the correct choice is that there is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. 
It also mentions the optional offloading of gradients to the CPU by setting `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer mentions offloading both gradients and optimizer states to the CPU, which aligns with the expert's mention of keeping optimizer states on the CPU and optionally offloading gradients. It also adds details about freeing up GPU memory, the process of transferring gradients once the device-to-host transfer finishes, and additional strategies like using a stateful optimizer and not using gradient accumulation to minimize slowdown.\\n\\n3. **Comparison**: The submitted answer includes all the points mentioned in the expert answer:\\n - Offloading optimizer states to the CPU.\\n - Optionally offloading gradients to the CPU with `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n - It adds more details about the process and additional strategies to minimize slowdown, which are not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and adds more details. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer**: The expert answer suggests using torchtune's utility functions to set only LoRA parameters as trainable. It involves fetching LoRA parameters using `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also mentions that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer**: The submitted answer provides a method using the `transformers` library. It involves creating a LoRA adapter, freezing the pre-trained model weights, and setting LoRA parameters to trainable. It provides a detailed code example for this process.\\n\\n3. **Comparison**:\\n - The expert answer focuses on using torchtune's utility functions, while the submitted answer uses the `transformers` library.\\n - Both answers aim to achieve the same goal: making only LoRA parameters trainable.\\n - The submitted answer provides a more detailed explanation and code example, which is not present in the expert answer.\\n - There is no direct conflict in the factual content, as both methods are valid ways to achieve the same result.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer because it includes additional details and a code example, while still being consistent with the expert's goal of making only LoRA parameters trainable. 
Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_rows = []\n", + "session_response = client.agents.session.retrieve(agent_id=search_agent.agent_id, session_id=search_session_id)\n", + "for i, turn in enumerate(session_response.turns):\n", + " eval_rows.append({\n", + " \"input_query\": examples[i][\"input_query\"],\n", + " \"expected_answer\": examples[i][\"expected_answer\"],\n", + " \"generated_answer\": turn.output_message.content,\n", + " })\n", + "\n", + "scoring_params = {\n", + " \"braintrust::factuality\": None,\n", + "}\n", + "scoring_response = client.scoring.score(\n", + " input_rows=eval_rows,\n", + " scoring_functions=scoring_params,\n", + ")\n", + "pprint(scoring_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. RAG Agent\n", + "\n", + "Now, let's see how we can improve the agent's performance by adding a RAG tool that explicitly retrieves information from Torchtune's documentation. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.types import Document\n", + "urls = [\n", + " \"memory_optimizations.rst\",\n", + " \"chat.rst\",\n", + " \"llama3.rst\",\n", + " \"datasets.rst\",\n", + " \"qat_finetune.rst\",\n", + " \"lora_finetune.rst\",\n", + "]\n", + "documents = [\n", + " Document(\n", + " document_id=f\"num-{i}\",\n", + " content=f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n", + " mime_type=\"text/plain\",\n", + " metadata={},\n", + " )\n", + " for i, url in enumerate(urls)\n", + "]\n", + "\n", + "vector_providers = [\n", + " provider for provider in client.providers.list() if provider.api == \"vector_io\"\n", + "]\n", + "selected_vector_provider = vector_providers[0]\n", + "vector_db_id = f\"test_vector_db_{uuid.uuid4()}\"\n", + "client.vector_dbs.register(\n", + " vector_db_id=vector_db_id,\n", + " embedding_model=\"all-MiniLM-L6-v2\",\n", + " embedding_dimension=384,\n", + " provider_id=selected_vector_provider.provider_id,\n", + ")\n", + "\n", + "client.tool_runtime.rag_tool.insert(\n", + " documents=documents,\n", + " vector_db_id=vector_db_id,\n", + " chunk_size_in_tokens=512,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
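Before wiring the vector store into an agent, a quick retrieval check can catch ingestion problems early. The sketch below assumes the `rag_tool.query` endpoint that pairs with the `rag_tool.insert` call above; verify the exact signature against your `llama_stack_client` version.

```python
# Sketch (verify the signature for your client version): query the vector DB
# directly to see which chunks the RAG tool would hand to the agent.
results = client.tool_runtime.rag_tool.query(
    content="What does DoRA stand for?",
    vector_db_ids=[vector_db_id],
)
print(results.content)
```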
+        "Question: What precision formats does torchtune support?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: Torchtune supports the following precision formats: \n", + "\n", + "1. fp32 (full precision, 4 bytes per model and optimizer parameter)\n", + "2. bfloat16 (half-precision, 2 bytes per model and optimizer parameter)\n", + "3. int8 (integer 8-bit, used for quantized models)\n", + "4. int4 (integer 4-bit, used for quantized models)\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats: \n", + "\n", + "\u001b[1;36m1\u001b[0m. fp32 \u001b[1m(\u001b[0mfull precision, \u001b[1;36m4\u001b[0m bytes per model and optimizer parameter\u001b[1m)\u001b[0m\n", + "\u001b[1;36m2\u001b[0m. bfloat16 \u001b[1m(\u001b[0mhalf-precision, \u001b[1;36m2\u001b[0m bytes per model and optimizer parameter\u001b[1m)\u001b[0m\n", + "\u001b[1;36m3\u001b[0m. int8 \u001b[1m(\u001b[0minteger \u001b[1;36m8\u001b[0m-bit, used for quantized models\u001b[1m)\u001b[0m\n", + "\u001b[1;36m4\u001b[0m. int4 \u001b[1m(\u001b[0minteger \u001b[1;36m4\u001b[0m-bit, used for quantized models\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: What does DoRA stand for in torchtune?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: DoRA stands for \"Decoupled Offline Replay Alignment\" in torchtune.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for \u001b[32m\"Decoupled Offline Replay Alignment\"\u001b[0m in torchtune.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
+        "Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU,\n",
+ "and performing optimizer steps on CPU. This can significantly reduce GPU memory usage at the cost of CPU RAM and \n",
+ "training speed. It is recommended to use this optimizer only if other techniques are not enough. Additionally, it \n",
+ "is suggested to use full bf16 training to minimize the slowdown and give GPU more work per optimizer step to \n",
+ "amortize the offloading time.\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU,\n",
+ "and performing optimizer steps on CPU. This can significantly reduce GPU memory usage at the cost of CPU RAM and \n",
+ "training speed. It is recommended to use this optimizer only if other techniques are not enough. Additionally, it \n",
+ "is suggested to use full bf16 training to minimize the slowdown and give GPU more work per optimizer step to \n",
+ "amortize the offloading time.\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Agent Answer: To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n", + "function from `torchtune.modules.peft.peft_utils` to set `requires_grad=True` on LoRA parameters and \n", + "`requires_grad=False` on all other parameters. This can be done after loading the base model weights into the LoRA \n", + "model. Additionally, you can use the `get_adapter_params` function to fetch all parameters associated with LoRA.\n", + "\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n", + "function from `torchtune.modules.peft.peft_utils` to set `\u001b[33mrequires_grad\u001b[0m=\u001b[3;92mTrue\u001b[0m` on LoRA parameters and \n", + "`\u001b[33mrequires_grad\u001b[0m=\u001b[3;91mFalse\u001b[0m` on all other parameters. This can be done after loading the base model weights into the LoRA \n", + "model. Additionally, you can use the `get_adapter_params` function to fetch all parameters associated with LoRA.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rag_agent = Agent(\n", + " client,\n", + " model=MODEL_ID,\n", + " instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. You should always use the RAG tool to answer questions.\",\n", + " tools=[{\n", + " \"name\": \"builtin::rag\",\n", + " \"args\": {\"vector_db_ids\": [vector_db_id]},\n", + " }],\n", + ")\n", + "\n", + "rag_session_id = rag_agent.create_session(session_name=f\"rag_session_{uuid.uuid4()}\")\n", + "for example in examples:\n", + " response = rag_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=rag_session_id,\n", + " stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ScoringScoreResponse(\n", + "│ results={\n", + "│ │ 'braintrust::factuality': ScoringResult(\n", + "│ │ │ aggregated_results={'average': {'average': 0.44999999999999996}},\n", + "│ │ │ score_rows=[\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': \"1. The expert answer states that Torchtune supports two precision formats: fp32 and bfloat16.\\n2. The submitted answer includes the same two precision formats: fp32 and bfloat16, with the same descriptions regarding their byte usage.\\n3. Additionally, the submitted answer mentions two more precision formats: int8 and int4, which are used for quantized models.\\n4. Since the submitted answer includes all the information from the expert answer and adds more details (int8 and int4), it is a superset of the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the additional information does not contradict the expert's information.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.0,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'D',\n", + "│ │ │ │ │ │ 'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Offline Replay Alignment.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.'\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': \"1. **Expert Answer Content**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU using `offload_gradients=True`.\\n\\n2. **Submitted Answer Content**: The submitted answer includes all the points mentioned in the expert answer: offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. Additionally, it provides more context by discussing the trade-offs involved, such as increased CPU RAM usage and potential training speed reduction. It also suggests using full bf16 training to mitigate these issues.\\n\\n3. **Comparison**: The submitted answer contains all the factual elements of the expert answer and adds more information about the implications and recommendations for using the CPUOffloadOptimizer. There is no conflict between the two answers; rather, the submission expands on the expert's points.\\n\\n4. **Conclusion**: Since the submitted answer includes all the details from the expert answer and adds additional relevant information, it is a superset of the expert answer and is fully consistent with it.\\n\\nTherefore, the correct choice is (B).\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ },\n", + "│ │ │ │ {\n", + "│ │ │ │ │ 'score': 0.6,\n", + "│ │ │ │ │ 'metadata': {\n", + "│ │ │ │ │ │ 'choice': 'B',\n", + "│ │ │ │ │ │ 'rationale': \"1. 
**Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer also describes using `set_trainable_params` to set `requires_grad=True` on LoRA parameters and `requires_grad=False` on others. It mentions using `get_adapter_params` to fetch LoRA parameters and suggests doing this after loading the base model weights into the LoRA model.\\n\\n3. **Comparison**:\\n - Both answers mention using `get_adapter_params` to fetch LoRA parameters.\\n - Both answers mention using `set_trainable_params` to set LoRA parameters as trainable.\\n - The submitted answer provides additional detail about setting `requires_grad=False` on other parameters and doing this after loading base model weights, which is not mentioned in the expert answer.\\n - The expert answer mentions that the LoRA recipe handles this automatically, which is not mentioned in the submitted answer.\\n\\n4. **Conclusion**: The submitted answer includes all the details from the expert answer and adds more information about setting `requires_grad` and the sequence of operations. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.\"\n", + "│ │ │ │ │ }\n", + "│ │ │ │ }\n", + "│ │ │ ]\n", + "│ │ )\n", + "│ }\n", + ")\n", + "\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.44999999999999996\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. The expert answer states that Torchtune supports two precision formats: fp32 and bfloat16.\\n2. The submitted answer includes the same two precision formats: fp32 and bfloat16, with the same descriptions regarding their byte usage.\\n3. Additionally, the submitted answer mentions two more precision formats: int8 and int4, which are used for quantized models.\\n4. Since the submitted answer includes all the information from the expert answer and adds more details \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 and int4\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, it is a superset of the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the additional information does not contradict the expert's information.\\n6. 
Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Offline Replay Alignment.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Content**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU using `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\n2. **Submitted Answer Content**: The submitted answer includes all the points mentioned in the expert answer: offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. Additionally, it provides more context by discussing the trade-offs involved, such as increased CPU RAM usage and potential training speed reduction. It also suggests using full bf16 training to mitigate these issues.\\n\\n3. **Comparison**: The submitted answer contains all the factual elements of the expert answer and adds more information about the implications and recommendations for using the CPUOffloadOptimizer. There is no conflict between the two answers; rather, the submission expands on the expert's points.\\n\\n4. 
**Conclusion**: Since the submitted answer includes all the details from the expert answer and adds additional relevant information, it is a superset of the expert answer and is fully consistent with it.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer also describes using `set_trainable_params` to set `\u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m` on LoRA parameters and `\u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m` on others. It mentions using `get_adapter_params` to fetch LoRA parameters and suggests doing this after loading the base model weights into the LoRA model.\\n\\n3. **Comparison**:\\n - Both answers mention using `get_adapter_params` to fetch LoRA parameters.\\n - Both answers mention using `set_trainable_params` to set LoRA parameters as trainable.\\n - The submitted answer provides additional detail about setting `\u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m` on other parameters and doing this after loading base model weights, which is not mentioned in the expert answer.\\n - The expert answer mentions that the LoRA recipe handles this automatically, which is not mentioned in the submitted answer.\\n\\n4. **Conclusion**: The submitted answer includes all the details from the expert answer and adds more information about setting `requires_grad` and the sequence of operations. 
Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.\"\u001b[0m\n",
+ "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
+ "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
+ "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n",
+ "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
+ "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
+ "\u001b[1m)\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "eval_rows = []\n",
+ "session_response = client.agents.session.retrieve(agent_id=rag_agent.agent_id, session_id=rag_session_id)\n",
+ "for i, turn in enumerate(session_response.turns):\n",
+ "    eval_rows.append({\n",
+ "        \"input_query\": examples[i][\"input_query\"],\n",
+ "        \"expected_answer\": examples[i][\"expected_answer\"],\n",
+ "        \"generated_answer\": turn.output_message.content,\n",
+ "    })\n",
+ "\n",
+ "scoring_params = {\n",
+ "    \"braintrust::factuality\": None,\n",
+ "}\n",
+ "scoring_response = client.scoring.score(\n",
+ "    input_rows=eval_rows,\n",
+ "    scoring_functions=scoring_params,\n",
+ ")\n",
+ "pprint(scoring_response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Deep dive into RAG Tool Performance\n",
+ "- Now, let's take a closer look at how the RAG tool performed, focusing on the second example, where the agent incorrectly identified what DoRA stands for (a quick direct retrieval check is sketched below).\n",
+ "- Notice that the issue lies in the retrieval step: the retrieved documents are not relevant to the question."
+ ]
+ },
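+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before inspecting the full agent turn, we can query the RAG tool directly, outside of the agent loop, to see which chunks are retrieved for the failing question. This is a minimal sketch: it assumes the `builtin::rag` runtime is reachable via `client.tool_runtime.rag_tool.query` and that `vector_db_id` is the vector database registered earlier in this notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch (assumption: the builtin::rag runtime exposes\n",
+ "# client.tool_runtime.rag_tool.query, and vector_db_id was registered earlier).\n",
+ "retrieval_response = client.tool_runtime.rag_tool.query(\n",
+ "    content=\"What does DoRA stand for in torchtune?\",\n",
+ "    vector_db_ids=[vector_db_id],\n",
+ ")\n",
+ "\n",
+ "# Preview each retrieved chunk to check whether any of them actually\n",
+ "# spells out the DoRA acronym.\n",
+ "for item in retrieval_response.content:\n",
+ "    rich.print(item.text[:200])\n",
+ "rich.print(retrieval_response.metadata)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "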
Turn(\n", + "│ input_messages=[UserMessage(content='What does DoRA stand for in torchtune?', role='user', context=None)],\n", + "│ output_message=CompletionMessage(\n", + "│ │ content='DoRA stands for \"Decoupled Offline Replay Alignment\" in torchtune.',\n", + "│ │ role='assistant',\n", + "│ │ stop_reason='end_of_turn',\n", + "│ │ tool_calls=[]\n", + "│ ),\n", + "│ session_id='b858ba10-8d18-4e1f-b7d0-5684daff0e71',\n", + "│ started_at=datetime.datetime(2025, 3, 6, 15, 45, 27, 129881, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=57600))),\n", + "│ steps=[\n", + "│ │ InferenceStep(\n", + "│ │ │ api_model_response=CompletionMessage(\n", + "│ │ │ │ content='',\n", + "│ │ │ │ role='assistant',\n", + "│ │ │ │ stop_reason='end_of_turn',\n", + "│ │ │ │ tool_calls=[\n", + "│ │ │ │ │ ToolCall(\n", + "│ │ │ │ │ │ arguments={'query': 'DoRA meaning in torchtune'},\n", + "│ │ │ │ │ │ call_id='abc4bb5c-ecc5-42a9-a604-c7a5b76220a9',\n", + "│ │ │ │ │ │ tool_name='knowledge_search'\n", + "│ │ │ │ │ )\n", + "│ │ │ │ ]\n", + "│ │ │ ),\n", + "│ │ │ step_id='c50c84e6-5ca7-4aa7-8c53-e01d639275eb',\n", + "│ │ │ step_type='inference',\n", + "│ │ │ turn_id='169a465d-9f29-43aa-9c70-e41393a9a504',\n", + "│ │ │ completed_at=datetime.datetime(2025, 3, 6, 15, 45, 27, 900249, tzinfo=TzInfo(-08:00)),\n", + "│ │ │ started_at=datetime.datetime(2025, 3, 6, 15, 45, 27, 130048, tzinfo=TzInfo(-08:00))\n", + "│ │ ),\n", + "│ │ ToolExecutionStep(\n", + "│ │ │ step_id='c1f98a74-0837-4f6d-9e70-7ec6b12d6413',\n", + "│ │ │ step_type='tool_execution',\n", + "│ │ │ tool_calls=[\n", + "│ │ │ │ ToolCall(\n", + "│ │ │ │ │ arguments={'query': 'DoRA meaning in torchtune'},\n", + "│ │ │ │ │ call_id='abc4bb5c-ecc5-42a9-a604-c7a5b76220a9',\n", + "│ │ │ │ │ tool_name='knowledge_search'\n", + "│ │ │ │ )\n", + "│ │ │ ],\n", + "│ │ │ tool_responses=[\n", + "│ │ │ │ ToolResponse(\n", + "│ │ │ │ │ call_id='abc4bb5c-ecc5-42a9-a604-c7a5b76220a9',\n", + "│ │ │ │ │ content=[\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n',\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text='Result 1:\\nDocument_id:num-0\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. 
note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n',\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text='Result 2:\\nDocument_id:num-1\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n',\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text=\"Result 3:\\nDocument_id:num-5\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune<overview_label>`\\n * Make sure to :ref:`install torchtune<install_label>`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\",\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text='Result 4:\\nDocument_id:num-0\\nContent: use the :class:`torch.optim.AdamW` optimizer with ``fused=True`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. code-block:: bash\\n\\n tune run <RECIPE> --config <CONFIG> \\\\\\n optimizer=optimizer=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n optimizer.offload_gradients=True \\\\\\n lr=4e-5\\n\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n offload_gradients: True\\n # additional key-word arguments can be passed to torch.optim.AdamW\\n lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer(\\n model.parameters(), # your model here\\n Adam,\\n lr=1e-5,\\n fused=True\\n )\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to (1) use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and (2) give GPU more work per optimizer step to amortize the offloading time (e.g. larger batch size with activation checkpointing, gradient accumulation).\\n* Gradient accumulation should always be set to 1 when ``offload_gradients=True``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. 
Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``fsdp_cpu_offload=True`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 <https://github.com/pytorch/torchtitan/blob/main/docs/fsdp\\n',\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(\n", + "│ │ │ │ │ │ │ text='Result 5:\\nDocument_id:num-5\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\\n',\n", + "│ │ │ │ │ │ │ type='text'\n", + "│ │ │ │ │ │ ),\n", + "│ │ │ │ │ │ TextContentItem(text='END of knowledge_search tool results.\\n', type='text')\n", + "│ │ │ │ │ ],\n", + "│ │ │ │ │ tool_name='knowledge_search',\n", + "│ │ │ │ │ metadata={'document_ids': ['num-0', 'num-1', 'num-5', 'num-0', 'num-5']}\n", + "│ │ │ │ )\n", + "│ │ │ ],\n", + "│ │ │ turn_id='169a465d-9f29-43aa-9c70-e41393a9a504',\n", + "│ │ │ completed_at=datetime.datetime(2025, 3, 6, 15, 45, 28, 79984, tzinfo=TzInfo(-08:00)),\n", + "│ │ │ started_at=datetime.datetime(2025, 3, 6, 15, 45, 27, 935151, tzinfo=TzInfo(-08:00))\n", + "│ │ ),\n", + "│ │ InferenceStep(\n", + "│ │ │ api_model_response=CompletionMessage(\n", + "│ │ │ │ content='DoRA stands for \"Decoupled Offline Replay Alignment\" in torchtune.',\n", + "│ │ │ │ role='assistant',\n", + "│ │ │ │ stop_reason='end_of_turn',\n", + "│ │ │ │ tool_calls=[]\n", + "│ │ │ ),\n", + "│ │ │ step_id='10712851-4808-4d6c-b80c-533c5ce23bab',\n", + "│ │ │ step_type='inference',\n", + "│ │ │ turn_id='169a465d-9f29-43aa-9c70-e41393a9a504',\n", + "│ │ │ completed_at=datetime.datetime(2025, 3, 6, 15, 45, 28, 790784, tzinfo=TzInfo(-08:00)),\n", + "│ │ │ started_at=datetime.datetime(2025, 3, 6, 15, 45, 28, 90603, tzinfo=TzInfo(-08:00))\n", + "│ │ )\n", + "│ ],\n", + "│ turn_id='169a465d-9f29-43aa-9c70-e41393a9a504',\n", + "│ completed_at=datetime.datetime(2025, 3, 6, 15, 45, 28, 802068, tzinfo=TzInfo(-08:00)),\n", + "│ output_attachments=[]\n", + ")\n", + "\n" + ], + "text/plain": [ + "\u001b[1;35mTurn\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33minput_messages\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mUserMessage\u001b[0m\u001b[1m(\u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'What does DoRA stand for in torchtune?'\u001b[0m, \u001b[33mrole\u001b[0m=\u001b[32m'user'\u001b[0m, \u001b[33mcontext\u001b[0m=\u001b[3;35mNone\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[33moutput_message\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'DoRA stands for \"Decoupled Offline Replay Alignment\" in torchtune.'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[33msession_id\u001b[0m=\u001b[32m'b858ba10-8d18-4e1f-b7d0-5684daff0e71'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m6\u001b[0m, \u001b[1;36m15\u001b[0m, \u001b[1;36m45\u001b[0m, \u001b[1;36m27\u001b[0m, \u001b[1;36m129881\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.timezone\u001b[0m\u001b[1m(\u001b[0m\u001b[1;35mdatetime.timedelta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdays\u001b[0m=\u001b[1;36m-1\u001b[0m, \u001b[33mseconds\u001b[0m=\u001b[1;36m57600\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", 
+ "\u001b[2;32m│ \u001b[0m\u001b[33msteps\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mInferenceStep\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mapi_model_response\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m''\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;35mToolCall\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33marguments\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'query'\u001b[0m: \u001b[32m'DoRA meaning in torchtune'\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'abc4bb5c-ecc5-42a9-a604-c7a5b76220a9'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mtool_name\u001b[0m=\u001b[32m'knowledge_search'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'c50c84e6-5ca7-4aa7-8c53-e01d639275eb'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'inference'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'169a465d-9f29-43aa-9c70-e41393a9a504'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m6\u001b[0m, \u001b[1;36m15\u001b[0m, \u001b[1;36m45\u001b[0m, \u001b[1;36m27\u001b[0m, \u001b[1;36m900249\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m6\u001b[0m, \u001b[1;36m15\u001b[0m, \u001b[1;36m45\u001b[0m, \u001b[1;36m27\u001b[0m, \u001b[1;36m130048\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mToolExecutionStep\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'c1f98a74-0837-4f6d-9e70-7ec6b12d6413'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'tool_execution'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;35mToolCall\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33marguments\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'query'\u001b[0m: \u001b[32m'DoRA meaning in torchtune'\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'abc4bb5c-ecc5-42a9-a604-c7a5b76220a9'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ 
\u001b[0m\u001b[33mtool_name\u001b[0m=\u001b[32m'knowledge_search'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mtool_responses\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;35mToolResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'abc4bb5c-ecc5-42a9-a604-c7a5b76220a9'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[1;35mTextContentItem\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33mtype\u001b[0m=\u001b[32m'text'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[1;35mTextContentItem\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'Result 1:\\nDocument_id:num-0\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA \u001b[0m\u001b[32m<\u001b[0m\u001b[32mglossary_lora\u001b[0m\u001b[32m>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``\u001b[0m\u001b[32mquantize\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m16\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mquantize_base\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. 
note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFSDP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP