error handling

2025-08-06 10:42:39 +00:00 · 2025-03-23 15:31:19 -07:00 · 2025-03-23 15:31:19 -07:00 · 31d27115f8
commit 31d27115f8
parent 327e4fcd97
2 changed files with 42 additions and 39 deletions
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@ -70,7 +70,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1409,7 +1409,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
@ -1430,13 +1430,13 @@
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> Torchtune supports two precision formats: `fp32` <span style=\"font-weight: bold\">(</span>full-precision<span style=\"font-weight: bold\">)</span> and `bfloat16` <span style=\"font-weight: bold\">(</span>half-precision<span style=\"font-weight: bold\">)</span>. \n",
       "The `bfloat16` format uses <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span> bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
-       "training speed. The default setting for Torchtune recipes is `bfloat16`.\n",
+       "training speed.\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports two precision formats: `fp32` \u001b[1m(\u001b[0mfull-precision\u001b[1m)\u001b[0m and `bfloat16` \u001b[1m(\u001b[0mhalf-precision\u001b[1m)\u001b[0m. \n",
       "The `bfloat16` format uses \u001b[1;36m2\u001b[0m bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
-       "training speed. The default setting for Torchtune recipes is `bfloat16`.\n"
+       "training speed.\n"
      ]
     },
     "metadata": {},
@ -1458,21 +1458,21 @@
    {
     "data": {
      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a parameter-efficient fine-tuning \n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA <span style=\"font-weight: bold\">(</span>Low-Rank Adaptation<span style=\"font-weight: bold\">)</span> \n",
-       "technique that builds on top of LoRA <span style=\"font-weight: bold\">(</span>Low-Rank Adaptation<span style=\"font-weight: bold\">)</span> by further decomposing the pre-trained weights into two \n",
+       "that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
-       "components: magnitude and direction. The magnitude component is a scalar vector that adjusts the scale, while the \n",
+       "component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
-       "direction component corresponds to the original LoRA decomposition and updates the orientation of weights. DoRA \n",
+       "decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
-       "adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to \n",
+       "addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
-       "improve the performance of LoRA, particularly at low ranks.\n",
+       "ranks.\n",
       "</pre>\n"
      ],
      "text/plain": [
-       "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a parameter-efficient fine-tuning \n",
+       "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m \n",
-       "technique that builds on top of LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m by further decomposing the pre-trained weights into two \n",
+       "that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
-       "components: magnitude and direction. The magnitude component is a scalar vector that adjusts the scale, while the \n",
+       "component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
-       "direction component corresponds to the original LoRA decomposition and updates the orientation of weights. DoRA \n",
+       "decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
-       "adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to \n",
+       "addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
-       "improve the performance of LoRA, particularly at low ranks.\n"
+       "ranks.\n"
      ]
     },
     "metadata": {},
@ -1495,14 +1495,14 @@
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
-       "CPU, and performing optimizer steps on the CPU. This can be used to significantly reduce GPU memory usage at the \n",
+       "CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
-       "cost of CPU RAM and training speed.\n",
+       "RAM and training speed.\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
-       "CPU, and performing optimizer steps on the CPU. This can be used to significantly reduce GPU memory usage at the \n",
+       "CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
-       "cost of CPU RAM and training speed.\n"
+       "RAM and training speed.\n"
      ]
     },
     "metadata": {},
@ -1547,8 +1547,8 @@
       "```bash\n",
       "tune run lora_finetune --config llama2/7B_lora\n",
       "```\n",
-       "This will fine-tune the LoRA parameters of the Llama2 model using the default configuration. You can modify the \n",
+       "This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
-       "configuration to suit your specific needs.\n",
+       "file to change the hyperparameters or the model architecture.\n",
       "</pre>\n"
      ],
      "text/plain": [
@ -1575,8 +1575,8 @@
       "```bash\n",
       "tune run lora_finetune --config llama2/7B_lora\n",
       "```\n",
-       "This will fine-tune the LoRA parameters of the Llama2 model using the default configuration. You can modify the \n",
+       "This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
-       "configuration to suit your specific needs.\n"
+       "file to change the hyperparameters or the model architecture.\n"
      ]
     },
     "metadata": {},
@ -1595,7 +1595,9 @@
    "\n",
    "attachments = [\n",
    "    {\n",
-    "        \"content\": f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
+    "        \"content\": {\n",
    "            \"uri\": f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
    "        },\n",
    "        \"mime_type\": \"text/plain\",\n",
    "    }\n",
    "\n",
@ -1628,7 +1630,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
@ -1643,21 +1645,21 @@
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
-       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. **Precision Formats**: Both the expert and the submitted answers mention the same precision formats supported by Torchtune: `fp32` and `bfloat16`. This is consistent between both answers.\\n\\n2. **Memory Usage**: Both answers state that `bfloat16` uses 2 bytes per model parameter, which is half the memory of `fp32` that uses 4 bytes. This is consistent between both answers.\\n\\n3. **Additional Information in Submission**: The submitted answer includes additional information that `bfloat16` improves training speed and is the default setting for Torchtune recipes. This information is not present in the expert answer.\\n\\n4. **Consistency**: The additional information in the submitted answer does not contradict the expert answer. It provides extra details that are not covered by the expert answer.\\n\\nBased on the above analysis, the submitted answer is a superset of the expert answer and is fully consistent with it. Therefore, the correct choice is (B).'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision).\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
-       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining it as a parameter-efficient fine-tuning technique that builds on LoRA by decomposing weights into magnitude and direction components.\\n4. The submitted answer further explains the roles of the magnitude and direction components and mentions the performance improvement of LoRA due to DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand on the concept.\\n6. Since the submitted answer includes all the information from the expert answer and adds more details without any contradiction, it is a superset of the expert answer and is fully consistent with it.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
-       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU using `offload_gradients=True`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. **Comparison**:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with `offload_gradients=True`, while the submission does not specify this as optional.\\n   - The submitted answer adds information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer includes all the details from the expert answer and adds additional information about the trade-offs. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter offload_gradients=True.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n   - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
@ -1683,21 +1685,21 @@
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
-       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. **Precision Formats**: Both the expert and the submitted answers mention the same precision formats supported by Torchtune: `fp32` and `bfloat16`. This is consistent between both answers.\\n\\n2. **Memory Usage**: Both answers state that `bfloat16` uses 2 bytes per model parameter, which is half the memory of `fp32` that uses 4 bytes. This is consistent between both answers.\\n\\n3. **Additional Information in Submission**: The submitted answer includes additional information that `bfloat16` improves training speed and is the default setting for Torchtune recipes. This information is not present in the expert answer.\\n\\n4. **Consistency**: The additional information in the submitted answer does not contradict the expert answer. It provides extra details that are not covered by the expert answer.\\n\\nBased on the above analysis, the submitted answer is a superset of the expert answer and is fully consistent with it. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
-       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining it as a parameter-efficient fine-tuning technique that builds on LoRA by decomposing weights into magnitude and direction components.\\n4. The submitted answer further explains the roles of the magnitude and direction components and mentions the performance improvement of LoRA due to DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand on the concept.\\n6. Since the submitted answer includes all the information from the expert answer and adds more details without any contradiction, it is a superset of the expert answer and is fully consistent with it.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
-       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU using `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. **Comparison**:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`, while the submission does not specify this as optional.\\n   - The submitted answer adds information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer includes all the details from the expert answer and adds additional information about the trade-offs. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter \u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n   - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -453,7 +453,9 @@ class ChatAgent(ShieldRunnerMixin):
            for document in documents:
                raw_document_text = await get_raw_document_text(document)
                contexts.append(raw_document_text)
-            input_messages[-1].context = "\n".join(contexts)
+        
            attached_context = "\n".join(contexts)
            input_messages[-1].context = attached_context
        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
@ -889,17 +891,16 @@ async def load_data_from_url(url: str) -> str:
            r = await client.get(url)
            resp = r.text
            return resp
-    return ""
+    raise ValueError(f"Unexpected URL: {type(url)}")
 async def get_raw_document_text(document: Document) -> str:
    if not document.mime_type.startswith("text/"):
        raise ValueError(f"Unexpected document mime type: {document.mime_type}")
    if isinstance(document.content, URL):
        return await load_data_from_url(document.content.uri)
    elif isinstance(document.content, str):
-        if document.content.startswith("http"):
+        return document.content
            return await load_data_from_url(document.content)
        else:
            return document.content
    elif isinstance(document.content, TextContentItem):
        return document.content.text
    else: