chore: refactor create_and_execute_turn and resume_turn (#1399)

# What does this PR do?
- Closes https://github.com/meta-llama/llama-stack/issues/1212

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v tests/integration/agents/test_agents.py --inference-model "meta-llama/Llama-3.3-70B-Instruct"
```
<img width="1203" alt="image"
src="https://github.com/user-attachments/assets/35b60017-b3f2-4e98-87f2-2868730261bd"
/>

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/agents/test_agents.py::test_rag_and_code_agent --inference-model "meta-llama/Llama-3.3-70B-Instruct"
```

[//]: # (## Documentation)
This commit is contained in:
Xi Yan 2025-03-04 16:07:30 -08:00 committed by GitHub
parent abfbaf3c1b
commit 78962be996
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 2447 additions and 487 deletions

View file

@ -12,7 +12,7 @@ import secrets
import string import string
import uuid import uuid
from datetime import datetime from datetime import datetime
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
@ -31,7 +31,6 @@ from llama_stack.apis.agents import (
AgentTurnResponseStreamChunk, AgentTurnResponseStreamChunk,
AgentTurnResponseTurnAwaitingInputPayload, AgentTurnResponseTurnAwaitingInputPayload,
AgentTurnResponseTurnCompletePayload, AgentTurnResponseTurnCompletePayload,
AgentTurnResponseTurnStartPayload,
AgentTurnResumeRequest, AgentTurnResumeRequest,
Attachment, Attachment,
Document, Document,
@ -184,115 +183,49 @@ class ChatAgent(ShieldRunnerMixin):
span.set_attribute("session_id", request.session_id) span.set_attribute("session_id", request.session_id)
span.set_attribute("agent_id", self.agent_id) span.set_attribute("agent_id", self.agent_id)
span.set_attribute("request", request.model_dump_json()) span.set_attribute("request", request.model_dump_json())
assert request.stream is True, "Non-streaming not supported"
session_info = await self.storage.get_session_info(request.session_id)
if session_info is None:
raise ValueError(f"Session {request.session_id} not found")
turns = await self.storage.get_session_turns(request.session_id)
messages = await self.get_messages_from_turns(turns)
messages.extend(request.messages)
turn_id = str(uuid.uuid4()) turn_id = str(uuid.uuid4())
span.set_attribute("turn_id", turn_id) span.set_attribute("turn_id", turn_id)
start_time = datetime.now().astimezone().isoformat() async for chunk in self._run_turn(request, turn_id):
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnStartPayload(
turn_id=turn_id,
)
)
)
steps = []
output_message = None
async for chunk in self.run(
session_id=request.session_id,
turn_id=turn_id,
input_messages=messages,
sampling_params=self.agent_config.sampling_params,
stream=request.stream,
documents=request.documents,
toolgroups_for_turn=request.toolgroups,
):
if isinstance(chunk, CompletionMessage):
logcat.info(
"agents",
f"returning result from the agent turn: {chunk}",
)
output_message = chunk
continue
assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
event = chunk.event
if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
steps.append(event.payload.step_details)
yield chunk yield chunk
assert output_message is not None
turn = Turn(
turn_id=turn_id,
session_id=request.session_id,
input_messages=request.messages,
output_message=output_message,
started_at=start_time,
completed_at=datetime.now().astimezone().isoformat(),
steps=steps,
)
await self.storage.add_turn_to_session(request.session_id, turn)
if output_message.tool_calls:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnAwaitingInputPayload(
turn=turn,
)
)
)
else:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnCompletePayload(
turn=turn,
)
)
)
yield chunk
async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator:
with tracing.span("resume_turn") as span: with tracing.span("resume_turn") as span:
span.set_attribute("agent_id", self.agent_id) span.set_attribute("agent_id", self.agent_id)
span.set_attribute("session_id", request.session_id) span.set_attribute("session_id", request.session_id)
span.set_attribute("turn_id", request.turn_id) span.set_attribute("turn_id", request.turn_id)
span.set_attribute("request", request.model_dump_json()) span.set_attribute("request", request.model_dump_json())
assert request.stream is True, "Non-streaming not supported" async for chunk in self._run_turn(request):
yield chunk
session_info = await self.storage.get_session_info(request.session_id) async def _run_turn(
if session_info is None: self,
raise ValueError(f"Session {request.session_id} not found") request: Union[AgentTurnCreateRequest, AgentTurnResumeRequest],
turn_id: Optional[str] = None,
) -> AsyncGenerator:
assert request.stream is True, "Non-streaming not supported"
turns = await self.storage.get_session_turns(request.session_id) is_resume = isinstance(request, AgentTurnResumeRequest)
if len(turns) == 0: session_info = await self.storage.get_session_info(request.session_id)
raise ValueError("No turns found for session") if session_info is None:
raise ValueError(f"Session {request.session_id} not found")
messages = await self.get_messages_from_turns(turns) turns = await self.storage.get_session_turns(request.session_id)
if is_resume and len(turns) == 0:
raise ValueError("No turns found for session")
steps = []
messages = await self.get_messages_from_turns(turns)
if is_resume:
messages.extend(request.tool_responses) messages.extend(request.tool_responses)
last_turn = turns[-1] last_turn = turns[-1]
last_turn_messages = self.turn_to_messages(last_turn) last_turn_messages = self.turn_to_messages(last_turn)
last_turn_messages = [ last_turn_messages = [
x for x in last_turn_messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage) x for x in last_turn_messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage)
] ]
# TODO: figure out whether we should add the tool responses to the last turn messages
last_turn_messages.extend(request.tool_responses) last_turn_messages.extend(request.tool_responses)
# get the steps from the turn id # get steps from the turn
steps = [] steps = last_turn.steps
steps = turns[-1].steps
# mark tool execution step as complete # mark tool execution step as complete
# if there's no tool execution in progress step (due to storage, or tool call parsing on client), # if there's no tool execution in progress step (due to storage, or tool call parsing on client),
@ -326,62 +259,67 @@ class ChatAgent(ShieldRunnerMixin):
) )
) )
) )
input_messages = last_turn_messages
output_message = None turn_id = request.turn_id
async for chunk in self.run( start_time = last_turn.started_at
session_id=request.session_id, else:
turn_id=request.turn_id, messages.extend(request.messages)
input_messages=messages, start_time = datetime.now().astimezone().isoformat()
sampling_params=self.agent_config.sampling_params, input_messages = request.messages
stream=request.stream,
):
if isinstance(chunk, CompletionMessage):
output_message = chunk
continue
assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}" output_message = None
event = chunk.event async for chunk in self.run(
if event.payload.event_type == AgentTurnResponseEventType.step_complete.value: session_id=request.session_id,
steps.append(event.payload.step_details) turn_id=turn_id,
input_messages=messages,
sampling_params=self.agent_config.sampling_params,
stream=request.stream,
documents=request.documents if not is_resume else None,
toolgroups_for_turn=request.toolgroups if not is_resume else None,
):
if isinstance(chunk, CompletionMessage):
output_message = chunk
continue
yield chunk assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
event = chunk.event
assert output_message is not None if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
steps.append(event.payload.step_details)
last_turn_start_time = datetime.now().astimezone().isoformat()
if len(turns) > 0:
last_turn_start_time = turns[-1].started_at
turn = Turn(
turn_id=request.turn_id,
session_id=request.session_id,
input_messages=last_turn_messages,
output_message=output_message,
started_at=last_turn_start_time,
completed_at=datetime.now().astimezone().isoformat(),
steps=steps,
)
await self.storage.add_turn_to_session(request.session_id, turn)
if output_message.tool_calls:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnAwaitingInputPayload(
turn=turn,
)
)
)
else:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnCompletePayload(
turn=turn,
)
)
)
yield chunk yield chunk
assert output_message is not None
turn = Turn(
turn_id=turn_id,
session_id=request.session_id,
input_messages=input_messages,
output_message=output_message,
started_at=start_time,
completed_at=datetime.now().astimezone().isoformat(),
steps=steps,
)
await self.storage.add_turn_to_session(request.session_id, turn)
if output_message.tool_calls:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnAwaitingInputPayload(
turn=turn,
)
)
)
else:
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnCompletePayload(
turn=turn,
)
)
)
yield chunk
async def run( async def run(
self, self,
session_id: str, session_id: str,

File diff suppressed because one or more lines are too long

View file

@ -44,6 +44,15 @@
"metadata": null "metadata": null
} }
}, },
"()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": {
"type": "value",
"value": {
"content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]",
"error_code": null,
"error_message": null,
"metadata": null
}
},
"()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { "()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": {
"type": "value", "type": "value",
"value": { "value": {
@ -71,15 +80,6 @@
"metadata": null "metadata": null
} }
}, },
"()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation[\\'Year\\'], average_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation Rate\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": {
"type": "value",
"value": {
"content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in <module>\n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
"error_code": null,
"error_message": null,
"metadata": null
}
},
"()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { "()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": {
"type": "value", "type": "value",
"value": { "value": {
@ -107,23 +107,23 @@
"type": "text" "type": "text"
}, },
{ {
"text": "Result 1:\nDocument_id:64211\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n", "text": "Result 1:\nDocument_id:cbc88\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 2:\nDocument_id:64211\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "text": "Result 2:\nDocument_id:cbc88\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 3:\nDocument_id:0c95c\nContent: with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n LoRA to:\n\n * ``q_proj`` applies LoRA to the query projection layer.\n * ``k_proj`` applies LoRA to the key projection layer.\n * ``v_proj`` applies LoRA to the value projection layer.\n * ``output_proj`` applies LoRA to the attention output projection layer.\n\n Whilst adding more layers to be fine-tuned may improve model accuracy,\n this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n This is usually a projection to vocabulary space (e.g. in language models), but\n other modelling tasks may have different projections - classifier models will project\n to the number of classes, for example\n\n.. note::\n\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.llama3.lora_llama3_8b\n apply_lora_to_mlp: True\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n", "text": "Result 3:\nDocument_id:8892b\nContent: with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n LoRA to:\n\n * ``q_proj`` applies LoRA to the query projection layer.\n * ``k_proj`` applies LoRA to the key projection layer.\n * ``v_proj`` applies LoRA to the value projection layer.\n * ``output_proj`` applies LoRA to the attention output projection layer.\n\n Whilst adding more layers to be fine-tuned may improve model accuracy,\n this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n This is usually a projection to vocabulary space (e.g. in language models), but\n other modelling tasks may have different projections - classifier models will project\n to the number of classes, for example\n\n.. note::\n\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.llama3.lora_llama3_8b\n apply_lora_to_mlp: True\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 4:\nDocument_id:64211\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "text": "Result 4:\nDocument_id:cbc88\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 5:\nDocument_id:1d70c\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n checkpointer.checkpoint_dir=<checkpoint_dir> \\\n tokenizer.path=<checkpoint_dir>/tokenizer.model \\\n checkpointer.output_dir=<checkpoint_dir>\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\n\n.. note::\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\n the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\n :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n", "text": "Result 5:\nDocument_id:9dcb7\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n checkpointer.checkpoint_dir=<checkpoint_dir> \\\n tokenizer.path=<checkpoint_dir>/tokenizer.model \\\n checkpointer.output_dir=<checkpoint_dir>\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\n\n.. note::\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\n the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\n :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n",
"type": "text" "type": "text"
}, },
{ {
@ -135,11 +135,11 @@
"error_message": null, "error_message": null,
"metadata": { "metadata": {
"document_ids": [ "document_ids": [
"6421150d-d334-4163-a058-3818b2b742e9", "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
"6421150d-d334-4163-a058-3818b2b742e9", "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
"0c95cff3-5612-40cf-a73d-77644a2462d0", "8892b092-6394-471e-b143-a23c6cc374f8",
"6421150d-d334-4163-a058-3818b2b742e9", "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
"1d70c86d-4cdf-4be9-a1f2-8a271b15ce2c" "9dcb747d-0627-40cc-a23c-0bee2b6b05af"
] ]
} }
} }
@ -307,23 +307,23 @@
"type": "text" "type": "text"
}, },
{ {
"text": "Result 1:\nDocument_id:7bdfa\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n", "text": "Result 1:\nDocument_id:f76dc\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 2:\nDocument_id:64211\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n", "text": "Result 2:\nDocument_id:c4fc3\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 3:\nDocument_id:0c95c\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n", "text": "Result 3:\nDocument_id:de2d4\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 4:\nDocument_id:64211\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "text": "Result 4:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 5:\nDocument_id:0c95c\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n", "text": "Result 5:\nDocument_id:de2d4\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
"type": "text" "type": "text"
}, },
{ {
@ -335,11 +335,11 @@
"error_message": null, "error_message": null,
"metadata": { "metadata": {
"document_ids": [ "document_ids": [
"7bdfad34-d546-4e98-9757-a0289696cd97", "f76dc7f5-9648-4272-a579-c8387fb1408a",
"6421150d-d334-4163-a058-3818b2b742e9", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"0c95cff3-5612-40cf-a73d-77644a2462d0", "de2d49de-55de-44dd-9bca-6f4f6d633b0a",
"6421150d-d334-4163-a058-3818b2b742e9", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"0c95cff3-5612-40cf-a73d-77644a2462d0" "de2d49de-55de-44dd-9bca-6f4f6d633b0a"
] ]
} }
} }
@ -362,23 +362,23 @@
"type": "text" "type": "text"
}, },
{ {
"text": "Result 1:\nDocument_id:7da0c\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n", "text": "Result 1:\nDocument_id:c4fc3\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune<overview_label>`\n * Make sure to :ref:`install torchtune<install_label>`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 2:\nDocument_id:7da0c\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "text": "Result 2:\nDocument_id:c4fc3\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 3:\nDocument_id:7da0c\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "text": "Result 3:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 4:\nDocument_id:7da0c\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\n", "text": "Result 4:\nDocument_id:c4fc3\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\n",
"type": "text" "type": "text"
}, },
{ {
"text": "Result 5:\nDocument_id:7da0c\nContent: ,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\n\nLet's run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\n lora_attn_modules=['q_proj','k_proj','v_proj','output_proj'] \\\n lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\n\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\n\n.. image:: /_static/img/lora_experiment_loss_curves.png\n\n.. note::\n The above figure was generated with W&B. You can use torchtune's :class:`~torchtune.training.metric_logging.WandBLogger`\n to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\n using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\n\n.. _lora_tutorial_memory_tradeoff_label:\n\nTrading off memory and model performance with LoRA\n--------------------------------------------------\n\nIn the preceding example, we ran LoRA on two devices. But given LoRA's low memory footprint, we can run fine-tuning\non a single device using most commodity GPUs which support `bfloat16 <https://\n", "text": "Result 5:\nDocument_id:c4fc3\nContent: ,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\n\nLet's run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\n lora_attn_modules=['q_proj','k_proj','v_proj','output_proj'] \\\n lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\n\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\n\n.. image:: /_static/img/lora_experiment_loss_curves.png\n\n.. note::\n The above figure was generated with W&B. You can use torchtune's :class:`~torchtune.training.metric_logging.WandBLogger`\n to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\n using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\n\n.. _lora_tutorial_memory_tradeoff_label:\n\nTrading off memory and model performance with LoRA\n--------------------------------------------------\n\nIn the preceding example, we ran LoRA on two devices. But given LoRA's low memory footprint, we can run fine-tuning\non a single device using most commodity GPUs which support `bfloat16 <https://\n",
"type": "text" "type": "text"
}, },
{ {
@ -390,11 +390,11 @@
"error_message": null, "error_message": null,
"metadata": { "metadata": {
"document_ids": [ "document_ids": [
"7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f", "c4fc3cb6-6172-489e-90a7-b39d343e14c0",
"7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f" "c4fc3cb6-6172-489e-90a7-b39d343e14c0"
] ]
} }
} }