From 4da8378ff8c8c3033c77fd77c80e9a9ca41a5b9f Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Sat, 22 Mar 2025 15:32:50 -0700
Subject: [PATCH] remove notebook

---
 docs/notebooks/RAG_as_attchements.ipynb | 297 ------------------------
 1 file changed, 297 deletions(-)
 delete mode 100644 docs/notebooks/RAG_as_attchements.ipynb

diff --git a/docs/notebooks/RAG_as_attchements.ipynb b/docs/notebooks/RAG_as_attchements.ipynb
deleted file mode 100644
index e8b90aa6d..000000000
--- a/docs/notebooks/RAG_as_attchements.ipynb
+++ /dev/null
@@ -1,297 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llama_stack_client import LlamaStackClient\n",
-    "from llama_stack_client.types import Document\n",
-    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
-    "from llama_stack_client.types.agent_create_params import AgentConfig\n",
-    "from llama_stack_client.lib.agents.agent import Agent\n",
-    "from rich.pretty import pprint\n",
-    "import json\n",
-    "import uuid\n",
-    "from pydantic import BaseModel\n",
-    "import rich\n",
-    "import os"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "MODEL_ID = \"meta-llama/Llama-3.3-70B-Instruct\"\n",
-    "\n",
-    "client = LlamaStackClient(\n",
-    "    base_url=\"http://localhost:8321\",\n",
-    "    provider_data={\n",
-    "        \"fireworks_api_key\": os.environ[\"FIREWORKS_API_KEY\"]\n",
-    "    }\n",
-    ")\n",
-    "\n",
-    "urls = [\n",
-    "    \"memory_optimizations.rst\",\n",
-    "    \"chat.rst\",\n",
-    "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
-    "    \"qat_finetune.rst\",\n",
-    "    \"lora_finetune.rst\",\n",
-    "]\n",
-    "\n",
-    "attachments = [\n",
-    "    {\n",
-    "        \"content\": f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
-    "        \"mime_type\": \"text/plain\",\n",
-    "    }\n",
-    "\n",
-    "    for i, url in enumerate(urls)\n",
-    "]\n",
-    "\n",
-    "simple_agent = Agent(client, model=MODEL_ID, \n",
-    "                     instructions=\"You are a helpful assistant that can answer questions about the Torchtune project.\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">
Turn(\n",
-       "│   input_messages=[\n",
-       "│   │   UserMessage(\n",
-       "│   │   │   content='What precision formats does torchtune support?',\n",
-       "│   │   │   role='user',\n",
-       "│   │   │   context='.. _memory_optimization_overview_label:\\n\\n============================\\nMemory Optimization Overview\\n============================\\n\\n**Author**: `Salman Mohammadi <https://github.com/SalmanMohammadi>`_\\n\\ntorchtune comes with a host of plug-and-play memory optimization components which give you lots of flexibility\\nto ``tune`` our recipes to your hardware. This page provides a brief glossary of these components and how you might use them.\\nTo make things easy, we\\'ve summarized these components in the following table:\\n\\n.. csv-table:: Memory optimization components\\n   :header: \"Component\", \"When to use?\"\\n   :widths: auto\\n\\n   \":ref:`glossary_precision`\", \"You\\'ll usually want to leave this as its default ``bfloat16``. It uses 2 bytes per model parameter instead of 4 bytes when using ``float32``.\"\\n   \":ref:`glossary_act_ckpt`\", \"Use when you\\'re memory constrained and want to use a larger model, batch size or context length. Be aware that it will slow down training speed.\"\\n   \":ref:`glossary_act_off`\", \"Similar to activation checkpointing, this can be used when memory constrained, but may decrease training speed. This **should** be used alongside activation checkpointing.\"\\n   \":ref:`glossary_grad_accm`\", \"Helpful when memory-constrained to simulate larger batch sizes. Not compatible with optimizer in backward. Use it when you can already fit at least one sample without OOMing, but not enough of them.\"\\n   \":ref:`glossary_low_precision_opt`\", \"Use when you want to reduce the size of the optimizer state. This is relevant when training large models and using optimizers with momentum, like Adam. Note that lower precision optimizers may reduce training stability/accuracy.\"\\n   \":ref:`glossary_opt_in_bwd`\", \"Use it when you have large gradients and can fit a large enough batch size, since this is not compatible with ``gradient_accumulation_steps``.\"\\n   \":ref:`glossary_cpu_offload`\", \"Offloads optimizer states and (optionally) gradients to CPU, and performs optimizer steps on CPU. This can be used to significantly reduce GPU memory usage at the cost of CPU RAM and training speed. Prioritize using it only if the other techniques are not enough.\"\\n   \":ref:`glossary_lora`\", \"When you want to significantly reduce the number of trainable parameters, saving gradient and optimizer memory during training, and significantly speeding up training. This may reduce training accuracy\"\\n   \":ref:`glossary_qlora`\", \"When you are training a large model, since quantization will save 1.5 bytes * (# of model parameters), at the potential cost of some training speed and accuracy.\"\\n   \":ref:`glossary_dora`\", \"a variant of LoRA that may improve model performance at the cost of slightly more memory.\"\\n\\n\\n.. note::\\n\\n  In its current state, this tutorial is focused on single-device optimizations. Check in soon as we update this page\\n  for the latest memory optimization features for distributed fine-tuning.\\n\\n.. _glossary_precision:\\n\\n\\nModel Precision\\n---------------\\n\\n*What\\'s going on here?*\\n\\nWe use the term \"precision\" to refer to the underlying data type used to represent the model and optimizer parameters.\\nWe support two data types in torchtune:\\n\\n.. 
note::\\n\\n  We recommend diving into Sebastian Raschka\\'s `blogpost on mixed-precision techniques <https://sebastianraschka.com/blog/2023/llm-mixed-precision-copy.html>`_\\n  for a deeper understanding of concepts around precision and data formats.\\n\\n* ``fp32``, commonly referred to as \"full-precision\", uses 4 bytes per model and optimizer parameter.\\n* ``bfloat16``, referred to as \"half-precision\", uses 2 bytes per model and optimizer parameter - effectively half\\n  the memory of ``fp32``, and also improves training speed. Generally, if your hardware supports training with ``bfloat16``,\\n  we recommend using it - this is the default setting for our recipes.\\n\\n.. note::\\n\\n  Another common paradigm is \"mixed-precision\" training: where model weights are in ``bfloat16`` (or ``fp16``), and optimizer\\n  states are in ``fp32``. Currently, we don\\'t support mixed-precision training in torchtune.\\n\\n*Sounds great! How do I use it?*\\n\\nSimply use the ``dtype`` flag or config entry in all our recipes! For example, to use half-precision training in ``bf16``,\\nset ``dtype=bf16``.\\n\\n.. _glossary_act_ckpt:\\n\\nActivation Checkpointing\\n------------------------\\n\\n*What\\'s going on here?*\\n\\nThe relevant section in the `PyTorch documentation <https://pytorch.org/docs/stable/checkpoint.html>`_ explains this concept well.\\nTo quote:\\n\\n  Activation checkpointing is a technique that trades compute for memory.\\n  Instead of keeping tensors needed for backward alive until they are used in\\n  gradient computation during backward, forward computation in checkpointed\\n  regions omits saving tensors for backward and recomputes them during the backward pass.\\n\\nThis setting is helpful for when you\\'re memory-constrained, especially due to larger batch sizes or longer context lengths.\\nHowever, these savings in memory come at the cost of training speed (i.e. tokens-per-second),\\nand in most cases training can slow down quite a bit as a result of this activation recomputation.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation checkpointing, use ``enable_activation_checkpointing=True``.\\n\\n.. _glossary_act_off:\\n\\nActivation Offloading\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nYou may have just read about activation checkpointing! Similar to checkpointing, offloading is a memory\\nefficiency technique that allows saving GPU VRAM by temporarily moving activations to CPU and bringing\\nthem back when needed in the backward pass.\\n\\nSee `PyTorch autograd hook tutorial <https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#saving-tensors-to-cpu>`_\\nfor more details about how this is implemented through :func:`torch.autograd.graph.saved_tensors_hooks`.\\n\\nThis setting is especially helpful for larger batch sizes, or longer context lengths when you\\'re memory constrained.\\nWhile of course it takes runtime and resources to move Tensors from GPU to CPU and back, the implementation in\\ntorchtune uses multiple CUDA streams (when available) in order to overlap the extra communication with the computation\\nto hide the extra runtime. As the communication workload is variable depending on the number and size of tensors being\\noffloaded, we do not recommend using it unless :ref:`glossary_act_ckpt` is also enabled, in which case only the checkpointed\\ntensors will be offloaded.\\n\\n*Sounds great! 
How do I use it?*\\n\\nTo enable activation offloading, use the ``enable_activation_offloading`` config entry or flag\\nin our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow\\nusage of streams, make sure you are on a torch version equal to or later than PyTorch.\\n\\n.. _glossary_grad_accm:\\n\\nGradient Accumulation\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nGradient accumulation allows you to simulate large batch sizes by *accumulating* gradients over several\\nbatches before updating model parameters using the optimizer. Concretely, the total number of samples used\\nfor a gradient update is when using gradient accumulation is:\\n\\n  ``total_batch_size = batch_size * gradient_accumulation_steps``\\n\\nFor example: with ``batch_size=1`` and ``gradient_accumulation_steps=32`` we get a total batch size of 32.\\n\\n.. note::\\n\\n  For other components in torchtune which use \"steps\", such as :ref:`metric logging <metric_logging_label>`, or\\n  :func:`learning rate schedulers <torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup>`, a \"step\" is counted as a\\n  single update to model parameters, rather than a single model forward pass with the data.\\n  Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``.\\n  Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.\\n  For this reason, metric logging will appear less frequently when training with gradient accumulation,\\n  and progress bars may update more slowly.\\n\\n\\nIf you\\'re using one of our distributed recipes, simply multiply by the number of devices:\\n\\n  ``total_batch_size = batch_size * gradient_accumulation_steps * num_devices``\\n\\nGradient accumulation is especially useful when you can fit at least one sample in your GPU. In this case, artificially increasing the batch by\\naccumulating gradients might give you faster training speeds than using other memory optimization techniques that trade-off memory for speed, like :ref:`activation checkpointing <glossary_act_ckpt>`.\\n\\n*Sounds great! How do I use it?*\\n\\nAll of our finetuning recipes support simulating larger batch sizes by accumulating gradients. Just set the\\n``gradient_accumulation_steps`` flag or config entry.\\n\\n.. note::\\n\\n  Gradient accumulation should always be set to 1 when :ref:`fusing the optimizer step into the backward pass <glossary_opt_in_bwd>`.\\n\\nOptimizers\\n----------\\n\\n.. _glossary_low_precision_opt:\\n\\nLower Precision Optimizers\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nIn addition to :ref:`reducing model and optimizer precision <glossary_precision>` during training, we can further reduce precision in our optimizer states.\\nAll of our recipes support lower-precision optimizers from the `torchao <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim>`_ library.\\nFor single device recipes, we also support `bitsandbytes <https://huggingface.co/docs/bitsandbytes/main/en/index>`_.\\n\\nA good place to start might be the :class:`torchao.prototype.low_bit_optim.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers.\\nBoth reduce memory by quantizing the optimizer state dict. Paged optimizers will also offload to CPU if there isn\\'t enough GPU memory available. In practice,\\nyou can expect higher memory savings from bnb\\'s PagedAdamW8bit but higher training speed from torchao\\'s AdamW8bit.\\n\\n*Sounds great! 
How do I use it?*\\n\\nTo use this in your recipes, make sure you have installed torchao (``pip install torchao``) or bitsandbytes (``pip install bitsandbytes``). Then, enable\\na low precision optimizer using the :ref:`cli_label`:\\n\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=torchao.prototype.low_bit_optim.AdamW8bit\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=bitsandbytes.optim.PagedAdamW8bit\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n  optimizer:\\n    _component_: bitsandbytes.optim.PagedAdamW8bit\\n    lr: 2e-5\\n\\n.. _glossary_opt_in_bwd:\\n\\nFusing Optimizer Step into Backward Pass\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nStateful optimizers (e.g. optimizers which use momentum) are the default in modern deep learning due to their stable convergence properties.\\nHowever, maintaining a state of gradient statistics comes at the cost of additional memory usage. An immediate alternative might be to\\nturn to stateless optimizers such as `stochastic gradient descent <https://pytorch.org/docs/stable/generated/torch.optim.SGD.html>`_\\nwithout momentum, which don\\'t require any additional memory usage, but will likely result in worse convergence during training.\\n\\nCan we find a middle ground here? Let\\'s consider a technique which enables the use of \"stateful\" optimizers such as `AdamW <https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html>`_\\nwithout the memory overhead of gradient statistics, and without sacrificing their desirable convergence properties.\\nHow is this possible, you might ask? By *completely removing the buffer of gradients* which are stored by the optimizer during its ``step()``.\\n\\nTo understand how this works, we encourage you to read through the relevant PyTorch tutorial on this concept:\\n`How to save memory by fusing the optimizer step into the backward pass <https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html>`_.\\n\\n\\n*Sounds great! How do I use it?*\\n\\n.. todo ref full finetune recipe doc\\n\\nIn torchtune, you can enable this feature using the ``optimizer_in_bwd`` flag. This feature works best when using a stateful optimizer\\nwith a model with a lot of parameters, and when you don\\'t need to use :ref:`gradient accumulation <glossary_grad_accm>`.\\nYou won\\'t see meaningful impact when finetuning LoRA recipes, since in this case the number of parameters being updated are small.\\n\\n.. _glossary_cpu_offload:\\n\\nOffloading Optimizer/Gradient states to CPU\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nWe\\'ve mentioned above the concept of optimizer states - memory used by the stateful optimizers to maintain a state of gradient statistics, and\\nmodel gradients - tensors used to store gradients when we perform model backwards passes. We support using CPU offloading in our single-device recipes\\nthrough the `CPUOffloadOptimizer <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_ from ``torchao``.\\n\\nThis optimizer can wrap any base optimizer and works by keeping the optimizer states and performing the optimizer step on CPU, thus reducing\\nGPU memory usage by the size of the optimizer states. 
Additionally, we can also offload gradients to the CPU by using `offload_gradients=True`.\\n\\nIf finetuning on a single-device, another option is to use the ``PagedAdamW8bit`` from bitsandbytes, mentioned :ref:`above <glossary_low_precision_opt>`, which will *only* offload to CPU\\nwhen there is not enough GPU available.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this optimizer in your recipes, set the ``optimizer`` key in your config to :class:`torchao.prototype.low_bit_optim.CPUOffloadOptimizer`, which\\nwill use the :class:`torch.optim.AdamW` optimizer with ``fused=True`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=optimizer=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n  optimizer.offload_gradients=True \\\\\\n  lr=4e-5\\n\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n  optimizer:\\n    _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n    offload_gradients: True\\n    # additional key-word arguments can be passed to torch.optim.AdamW\\n    lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer(\\n     model.parameters(), # your model here\\n     Adam,\\n     lr=1e-5,\\n     fused=True\\n )\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to (1) use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and (2) give GPU more work per optimizer step to amortize the offloading time (e.g. larger batch size with activation checkpointing, gradient accumulation).\\n* Gradient accumulation should always be set to 1 when ``offload_gradients=True``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``fsdp_cpu_offload=True`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 <https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md>`_ to see how they differ.\\n\\n\\n.. _glossary_peft:\\n\\nParameter Efficient Fine-Tuning (PEFT)\\n--------------------------------------\\n\\n.. _glossary_lora:\\n\\nLow Rank Adaptation (LoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n*What\\'s going on here?*\\n\\nYou can read our tutorial on :ref:`finetuning Llama2 with LoRA<lora_finetune_label>` to understand how LoRA works, and how to use it.\\nSimply stated, LoRA greatly reduces the number of trainable parameters, thus saving significant gradient and optimizer\\nmemory during training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using any of our recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device<lora_finetune_recipe_label>`. 
These recipes utilize\\nLoRA-enabled model builders, which we support for all our models, and also use the ``lora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3` model has a corresponding :func:`torchtune.models.llama3.lora_llama3`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n  LoRA to:\\n\\n  * ``q_proj`` applies LoRA to the query projection layer.\\n  * ``k_proj`` applies LoRA to the key projection layer.\\n  * ``v_proj`` applies LoRA to the value projection layer.\\n  * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\\n  this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n  This is usually a projection to vocabulary space (e.g. in language models), but\\n  other modelling tasks may have different projections - classifier models will project\\n  to the number of classes, for example\\n\\n.. note::\\n\\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n  final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of the LoRA decomposition, where ``lora_rank << in_dim`` and ``lora_rank << out_dim``\\n  \\\\- the dimensions of an arbitrary linear layer in the model. Concretely, ``lora_rank`` reduces the number of gradients stored\\n  in a linear fashion from ``in_dim * out_dim`` to ``lora_rank * (in_dim + out_dim)``. Typically, we have ``lora_rank in [8, 256]``.\\n* ``lora_alpha: float`` affects the magnitude of the LoRA updates. A larger alpha results in larger updates to the base model weights\\n  , potentially at the cost of training stability, conversely, smaller alpha can stabilize training at the cost of slower learning.\\n  We provide default settings for these parameters which we\\'ve tested with all of our models, but we encourage you to adjust them\\n  to your specific use case. Typically, one jointly changes ``lora_rank`` and ``lora_alpha`` together, where ``lora_alpha ~= 2*lora_rank``.\\n* ``lora_dropout`` introduces dropout in the LoRA layers to help regularize training. 
We default to 0.0 for all of our models.\\n\\nAs above, these parameters are also specified under the ``model`` flag or config entry:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"] \\\\\\n  model.lora_rank=32 \\\\\\n  model.lora_alpha=64\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n    lora_rank: 32\\n    lora_alpha: 64\\n\\n.. note::\\n\\n  To get a deeper sense of how LoRA parameters affect memory usage during training,\\n  see the :ref:`relevant section in our Llama2 LoRA tutorial<lora_tutorial_memory_tradeoff_label>`.\\n\\n.. _glossary_qlora:\\n\\nQuantized Low Rank Adaptation (QLoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`QLoRA <https://arxiv.org/abs/2305.14314>`_ is a memory enhancement on top of `LoRA <https://arxiv.org/abs/2106.09685>`_\\nthat maintains the frozen model parameters from LoRA in 4-bit quantized precision, thereby reducing memory usage.\\nThis is enabled through a novel  4-bit NormalFloat (NF4) data type proposed by the authors, which allows for 4-8x less\\nparameter memory usage whilst retaining model accuracy. You can read our tutorial on :ref:`finetuning Llama2 with QLoRA<qlora_finetune_label>`\\nfor a deeper understanding of how it works.\\n\\nWhen considering using QLoRA to reduce memory usage, it\\'s worth noting that QLoRA is slower than LoRA and may not be worth it if\\nthe model you are finetuning is small. In numbers, QLoRA saves roughly 1.5 bytes * (# of model parameters). Also, although QLoRA quantizes the model,\\nit minimizes accuracy degradation by up-casting quantized parameters to the original higher precision datatype during model forward passes - this up-casting may incur penalties to training speed.\\nThe :ref:`relevant section <qlora_compile_label>` in our QLoRA tutorial demonstrates the usage of ``torch.compile`` to address this by speeding up training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using QLoRA with any of our LoRA recipes, i.e. recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device<lora_finetune_recipe_label>`. These recipes utilize\\nQLoRA-enabled model builders, which we support for all our models, and also use the ``qlora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3_8b` model has a corresponding :func:`torchtune.models.llama3.qlora_llama3_8b`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with QLoRA quickly,\\njust specify any config with ``_qlora`` in its name.\\n\\nAll the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA <glossary_lora>`\\nto see how to configure these parameters.\\n\\nTo configure from the command line:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=32 \\\\\\n  model.lora_alpha=64\\n\\n\\nor, by modifying a config:\\n\\n.. 
code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.qlora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 32\\n    lora_alpha: 64\\n\\n.. _glossary_dora:\\n\\nWeight-Decomposed Low-Rank Adaptation (DoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`DoRA <https://arxiv.org/abs/2402.09353>`_ is another PEFT technique which builds on-top of LoRA by\\nfurther decomposing the pre-trained weights into two components: magnitude and direction. The magnitude component\\nis a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA decomposition and\\nupdates the orientation of weights.\\n\\nDoRA adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to\\nimprove the performance of LoRA, particularly at low ranks.\\n\\n*Sounds great! How do I use it?*\\n\\nMuch like LoRA and QLoRA, you can finetune using DoRA with any of our LoRA recipes. We use the same model builders for LoRA\\nas we do for DoRA, so you can use the ``lora_`` version of any model builder with ``use_dora=True``. For example, to finetune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n.. _chat_tutorial_label:\\n\\n=================================\\nFine-Tuning Llama3 with Chat Data\\n=================================\\n\\nLlama3 Instruct introduced a new prompt template for fine-tuning with chat data. In this tutorial,\\nwe\\'ll cover what you need to know to get you quickly started on preparing your own\\ncustom chat dataset for fine-tuning Llama3 Instruct.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` You will learn:\\n\\n      * How the Llama3 Instruct format differs from Llama2\\n      * All about prompt templates and special tokens\\n      * How to use your own chat dataset to fine-tune Llama3 Instruct\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`configuring datasets<chat_dataset_usage_label>`\\n      * Know how to :ref:`download Llama3 Instruct weights <llama3_label>`\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you\\'ll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-2>`_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n    <s>[INST] <<SYS>>\\n    You are a helpful, respectful, and honest assistant.\\n    <</SYS>>\\n\\n    Hi! I am a human. [/INST] Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant </s>\\n\\nLlama3 Instruct `overhauled <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3>`_\\nthe template from Llama2 to better support multiturn conversations. The same text\\nin the Llama3 Instruct format would look like this:\\n\\n.. code-block:: text\\n\\n    <|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n    You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n    Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n    Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n    The Llama3 Base model uses a `different prompt template\\n    <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3>`_ than Llama3 Instruct\\n    because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n    are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n    template for optimal performance. Generally, for instruct and chat data, we recommend using\\n    Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n    Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n    sample = [\\n        {\\n            \"role\": \"system\",\\n            \"content\": \"You are a helpful, respectful, and honest assistant.\",\\n        },\\n        {\\n            \"role\": \"user\",\\n            \"content\": \"Who are the most influential hip-hop artists of all time?\",\\n        },\\n        {\\n            \"role\": \"assistant\",\\n            \"content\": \"Here is a list of some of the most influential hip-hop \"\\n            \"artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\",\\n        },\\n    ]\\n\\nNow, let\\'s format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and\\nsee how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**,\\nwhich simply structures a prompt with flavor text to indicate a certain task.\\n\\n.. code-block:: python\\n\\n    from torchtune.data import Llama2ChatTemplate, Message\\n\\n    messages = [Message.from_dict(msg) for msg in sample]\\n    formatted_messages = Llama2ChatTemplate.format(messages)\\n    print(formatted_messages)\\n    # [\\n    #     Message(\\n    #         role=\\'user\\',\\n    #         content=\\'[INST] <<SYS>>\\\\nYou are a helpful, respectful, and honest assistant.\\\\n<</SYS>>\\\\n\\\\nWho are the most influential hip-hop artists of all time? [/INST] \\',\\n    #         ...,\\n    #     ),\\n    #     Message(\\n    #         role=\\'assistant\\',\\n    #         content=\\'Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\\',\\n    #         ...,\\n    #     ),\\n    # ]\\n\\nThere are also special tokens used by Llama2, which are not in the prompt template.\\nIf you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you\\'ll notice that\\nwe don\\'t include the :code:`<s>` and :code:`</s>` tokens. These are the beginning-of-sequence\\n(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer\\nthan the rest of the prompt template. Let\\'s tokenize this example with the\\n:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see\\nwhy.\\n\\n.. code-block:: python\\n\\n    from torchtune.models.llama2 import llama2_tokenizer\\n\\n    tokenizer = llama2_tokenizer(\"/tmp/Llama-2-7b-hf/tokenizer.model\")\\n    user_message = formatted_messages[0].text_content\\n    tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True)\\n    print(tokens)\\n    # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2]\\n\\nWe\\'ve added the BOS and EOS tokens when encoding our example text. This shows up\\nas IDs 1 and 2. We can verify that these are our BOS and EOS tokens.\\n\\n.. code-block:: python\\n\\n    print(tokenizer._spm_model.spm_model.piece_to_id(\"<s>\"))\\n    # 1\\n    print(tokenizer._spm_model.spm_model.piece_to_id(\"</s>\"))\\n    # 2\\n\\nThe BOS and EOS tokens are what we call special tokens, because they have their own\\nreserved token IDs. This means that they will index to their own individual vectors in\\nthe model\\'s learnt embedding table. The rest of the prompt template tags, :code:`[INST]`\\nand :code:`<<SYS>>` are tokenized as normal text and not their own IDs.\\n\\n.. 
code-block:: python\\n\\n    print(tokenizer.decode(518))\\n    # \\'[\\'\\n    print(tokenizer.decode(25580))\\n    # \\'INST\\'\\n    print(tokenizer.decode(29962))\\n    # \\']\\'\\n    print(tokenizer.decode([3532, 14816, 29903, 6778]))\\n    # \\'<<SYS>>\\'\\n\\nIt\\'s important to note that you should not place the special reserved tokens in your\\ninput prompts manually, as it will be treated as normal text and not as a special\\ntoken.\\n\\n.. code-block:: python\\n\\n    print(tokenizer.encode(\"<s>\", add_bos=False, add_eos=False))\\n    # [529, 29879, 29958]\\n\\nNow let\\'s take a look at Llama3\\'s formatting to see how it\\'s tokenized differently\\nthan Llama2.\\n\\n.. code-block:: python\\n\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    messages = [Message.from_dict(msg) for msg in sample]\\n    tokens, mask = tokenizer.tokenize_messages(messages)\\n    print(tokenizer.decode(tokens))\\n    # \\'<|start_header_id|>system<|end_header_id|>\\\\n\\\\nYou are a helpful, respectful,\\n    # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\\\n\\\\nWho\\n    # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|>\\n    # assistant<|end_header_id|>\\\\n\\\\nHere is a list of some of the most influential hip-hop\\n    # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>\\'\\n\\n.. note::\\n    We used the ``tokenize_messages`` API for Llama3, which is different than\\n    encode. It simply manages adding all the special tokens in the correct\\n    places after encoding the individual messages.\\n\\nWe can see that the tokenizer handled all the formatting without us specifying a prompt\\ntemplate. It turns out that all of the additional tags are special tokens, and we don\\'t require\\na separate prompt template. We can verify this by checking if the tags get encoded\\nas their own token IDs.\\n\\n.. code-block:: python\\n\\n    print(tokenizer.special_tokens[\"<|begin_of_text|>\"])\\n    # 128000\\n    print(tokenizer.special_tokens[\"<|eot_id|>\"])\\n    # 128009\\n\\nThe best part is - all these special tokens are handled purely by the tokenizer.\\nThat means you won\\'t have to worry about messing up any required prompt templates!\\n\\n\\nWhen should I use a prompt template?\\n------------------------------------\\n\\nWhether or not to use a prompt template is governed by what your desired inference\\nbehavior is. You should use a prompt template if you are running inference on the\\nbase model and it was pre-trained with a prompt template, or you want to prime a\\nfine-tuned model to expect a certain prompt structure on inference for a specific task.\\n\\nIt is not strictly necessary to fine-tune with a prompt template, but generally\\nspecific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate`\\nprovides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text.\\nThis would wrap around the user message, with the assistant message untouched.\\n\\n.. code-block:: python\\n\\n    f\"Summarize this dialogue:\\\\n{dialogue}\\\\n---\\\\nSummary:\\\\n\"\\n\\nYou can fine-tune Llama2 with this template even though the model was originally pre-trained\\nwith the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model\\nsees during inference. 
The model should be robust enough to adapt to a new template.\\n\\n\\nFine-tuning on a custom chat dataset\\n------------------------------------\\n\\nLet\\'s test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom\\nchat dataset. We\\'ll walk through how to set up our data so that it can be tokenized\\ncorrectly and fed into our model.\\n\\nLet\\'s say we have a local dataset saved as a JSON file that contains conversations\\nwith an AI model. How can we get something like this into a format\\nLlama3 understands and tokenizes correctly?\\n\\n.. code-block:: python\\n\\n    # data/my_data.json\\n    [\\n        {\\n            \"dialogue\": [\\n                {\\n                    \"from\": \"human\",\\n                    \"value\": \"What is your name?\"\\n                },\\n                {\\n                    \"from\": \"gpt\",\\n                    \"value\": \"I am an AI assistant, I don\\'t have a name.\"\\n                },\\n                {\\n                    \"from\": \"human\",\\n                    \"value\": \"Pretend you have a name.\"\\n                },\\n                {\\n                    \"from\": \"gpt\",\\n                    \"value\": \"My name is Mark Zuckerberg.\"\\n                }\\n            ]\\n        },\\n    ]\\n\\nLet\\'s first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we\\nhave conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://docs.mistral.ai/getting-started/open_weight_models/#chat-template>`_.\\n\\nNow we\\'re ready to start fine-tuning! We\\'ll use the built-in LoRA single device recipe.\\nUse the :ref:`tune cp <tune_cp_cli_label>` command to get a copy of the :code:`8B_lora_single_device.yaml`\\nconfig and update it with your dataset configuration.\\n\\nLaunch the fine-tune!\\n\\n.. code-block:: bash\\n\\n    $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15\\n\\n.. _llama3_label:\\n\\n========================\\nMeta Llama3 in torchtune\\n========================\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to:\\n\\n      * Download the Llama3-8B-Instruct weights and tokenizer\\n      * Fine-tune Llama3-8B-Instruct with LoRA and QLoRA\\n      * Evaluate your fine-tuned Llama3-8B-Instruct model\\n      * Generate text with your fine-tuned model\\n      * Quantize your model to speed up generation\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n\\n\\nLlama3-8B\\n---------\\n\\n`Meta Llama 3 <https://llama.meta.com/llama3>`_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks <https://huggingface.co/meta-llama/Meta-Llama-3-8B#base-pretrained-models>`_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention <https://arxiv.org/abs/2305.13245>`_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken <https://github.com/openai/tiktoken>`_ instead of `sentencepiece <https://github.com/google/sentencepiece>`_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings <https://arxiv.org/abs/2104.09864>`_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let\\'s download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page <https://github.com/meta-llama/llama3/blob/main/README.md>`_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here <https://huggingface.co/settings/tokens>`_.\\n\\n\\n.. code-block:: bash\\n\\n    tune download meta-llama/Meta-Llama-3-8B-Instruct \\\\\\n        --output-dir <checkpoint_dir> \\\\\\n        --hf-token <ACCESS TOKEN>\\n\\n|\\n\\nFine-tuning Llama3-8B-Instruct in torchtune\\n-------------------------------------------\\n\\ntorchtune provides `LoRA <https://arxiv.org/abs/2106.09685>`_, `QLoRA <https://arxiv.org/abs/2305.14314>`_, and full fine-tuning\\nrecipes for fine-tuning Llama3-8B on one or more GPUs. 
For more on LoRA in torchtune, see our :ref:`LoRA Tutorial <lora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\\\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\\\\n        checkpointer.output_dir=<checkpoint_dir>\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\\n\\n.. note::\\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\\n    that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill be saved separately.\\n\\nIn our experiments, we observed a peak memory usage of 18.5 GB. The default config can be trained on a consumer GPU with 24 GB VRAM.\\n\\nIf you have multiple GPUs available, you can run the distributed version of the recipe.\\ntorchtune makes use of the `FSDP <https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html>`_ APIs from PyTorch Distributed\\nto shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster overall training.\\nFor example, on two devices:\\n\\n.. code-block:: bash\\n\\n    tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora\\n\\nFinally, if we want to use even less memory, we can leverage torchtune\\'s QLoRA recipe via:\\n\\n.. TODO (SalmanMohammadi) ref qlora recipe page\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_qlora_single_device\\n\\nSince our default configs enable full bfloat16 training, all of the above commands can be run with\\ndevices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory\\nbelow 10 GB. 
You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.\\nTry it out!\\n\\n|\\n\\nEvaluating fine-tuned Llama3-8B models with EleutherAI\\'s Eval Harness\\n---------------------------------------------------------------------\\n\\nNow that we\\'ve fine-tuned our model, what\\'s next? Let\\'s take our LoRA-finetuned model from the\\npreceding section and look at a couple different ways we can evaluate its performance on the tasks we care about.\\n\\nFirst, torchtune provides an integration with\\n`EleutherAI\\'s evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_\\nfor model evaluation on common benchmark tasks.\\n\\n.. note::\\n    Make sure you\\'ve first installed the evaluation harness via :code:`pip install \"lm_eval==0.4.*\"`.\\n\\nFor this tutorial we\\'ll use the `truthfulqa_mc2 <https://github.com/sylinrl/TruthfulQA>`_ task from the harness.\\nThis task measures a model\\'s propensity to be truthful when answering questions and\\nmeasures the model\\'s zero-shot accuracy on a question followed by one or more true\\nresponses and one or more false responses. First, let\\'s copy the config so we can point the YAML\\nfile to our fine-tuned checkpoint files.\\n\\n.. code-block:: bash\\n\\n    tune cp eleuther_evaluation ./custom_eval_config.yaml\\n\\nNext, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.\\n\\n.. code-block:: yaml\\n\\n    model:\\n      _component_: torchtune.models.llama3.llama3_8b\\n\\n    checkpointer:\\n      _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n      # directory with the checkpoint files\\n      # this should match the output_dir specified during\\n      # fine-tuning\\n      checkpoint_dir: <checkpoint_dir>\\n\\n      # checkpoint files for the fine-tuned model. These will be logged\\n      # at the end of your fine-tune\\n      checkpoint_files: [\\n        meta_model_0.pt\\n      ]\\n\\n      output_dir: <checkpoint_dir>\\n      model_type: LLAMA3\\n\\n    # Make sure to update the tokenizer path to the right\\n    # checkpoint directory as well\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: <checkpoint_dir>/tokenizer.model\\n\\nFinally, we can run evaluation using our modified config.\\n\\n.. code-block:: bash\\n\\n    tune run eleuther_eval --config ./custom_eval_config.yaml\\n\\nTry it for yourself and see what accuracy your model gets!\\n\\n|\\n\\nGenerating text with our fine-tuned Llama3 model\\n------------------------------------------------\\n\\n.. TODO (SalmanMohammadi) ref generate recipe page\\n\\nNext, let\\'s look at one other way we can evaluate our model: generating text! torchtune provides a\\n`recipe for generation <https://github.com/pytorch/torchtune/blob/main/recipes/generate.py>`_ as well.\\n\\nSimilar to what we did, let\\'s copy and modify the default generation config.\\n\\n.. code-block:: bash\\n\\n    tune cp generation ./custom_generation_config.yaml\\n\\nNow we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer.\\n\\n.. code-block:: yaml\\n\\n    model:\\n      _component_: torchtune.models.llama3.llama3_8b\\n\\n    checkpointer:\\n      _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n      # directory with the checkpoint files\\n      # this should match the output_dir specified during\\n      # fine-tuning\\n      checkpoint_dir: <checkpoint_dir>\\n\\n      # checkpoint files for the fine-tuned model. 
These will be logged\\n      # at the end of your fine-tune\\n      checkpoint_files: [\\n        meta_model_0.pt\\n      ]\\n\\n      output_dir: <checkpoint_dir>\\n      model_type: LLAMA3\\n\\n    # Make sure to update the tokenizer path to the right\\n    # checkpoint directory as well\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: <checkpoint_dir>/tokenizer.model\\n\\nRunning generation with our LoRA-finetuned model, we see the following output:\\n\\n.. code-block:: bash\\n\\n    tune run generate --config ./custom_generation_config.yaml \\\\\\n    prompt.user=\"Hello, my name is\"\\n\\n    [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.\\n    ...\\n    [generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec\\n    [generate.py:138] Bandwidth achieved: 346.09 GB/s\\n    [generate.py:139] Memory used: 18.31 GB\\n\\nFaster generation via quantization\\n----------------------------------\\n\\nWe rely on `torchao <https://github.com/pytorch-labs/ao>`_ for `post-training quantization <https://github.com/pytorch/ao/tree/main/torchao/quantization#quantization>`_.\\nTo quantize the fine-tuned model after installing torchao we can run the following command::\\n\\n  # we also support `int8_weight_only()` and `int8_dynamic_activation_int8_weight()`, see\\n  # https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques\\n  # for a full list of techniques that we support\\n  from torchao.quantization.quant_api import quantize_, int4_weight_only\\n  quantize_(model, int4_weight_only())\\n\\nAfter quantization, we rely on torch.compile for speedups. For more details, please see `this example usage <https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#quantization-flow-example>`_.\\n\\ntorchao also provides `this table <https://github.com/pytorch/ao#inference>`_ listing performance and accuracy results for ``llama2`` and ``llama3``.\\n\\nFor Llama models, you can run generation directly in torchao on the quantized model using their ``generate.py`` script as\\ndiscussed in `this readme <https://github.com/pytorch/ao/tree/main/torchao/_models/llama>`_. This way you can compare your own results\\nto those in the previously-linked table.\\n\\n\\nThis is just the beginning of what you can do with Meta Llama3 using torchtune and the broader ecosystem.\\nWe look forward to seeing what you build!\\n\\n404: Not Found\\n.. _qat_finetune_label:\\n\\n===========================\\nFine-Tuning Llama3 with QAT\\n===========================\\n\\nQuantization-Aware Training (QAT) is a common technique for users to quantize their\\nmodels without incurring significant degradations in accuracy or perplexity. In this\\ntutorial, we’ll walk through how to apply QAT during fine-tuning, quantize the\\nresulting model, and evaluate your quantized model using torchtune.\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What QAT is and how it helps reduce quantization degradation\\n      * How to run QAT during fine-tuning in torchtune\\n      * End-to-end example of connecting QAT, quantization, and evaluation recipes\\n\\n    .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama3-8B model weights<download_llama_label>`\\n\\n.. _what_is_qat_label:\\n\\nWhat is QAT?\\n------------\\n\\n`Quantization-Aware Training <https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training>`_ (QAT) refers to simulating quantization numerics during\\ntraining or fine-tuning, with the end goal of ultimately producing a higher quality\\nquantized model compared to simple post-training quantization (PTQ). During QAT,\\nthe weights and/or activations are “fake quantized”, meaning they are transformed\\nas if they were being quantized, but kept in the original data type (e.g. bfloat16)\\nwithout being actually cast to lower bit-widths. Thus, fake quantization allows the\\nmodel to adjust for quantization noise when updating the weights, hence the training\\nprocess is “aware” that the model will ultimately be quantized after training.\\n\\n.. code-block:: python\\n\\n  # PTQ: x_q is quantized and cast to int8\\n  # scale and zero point (zp) refer to parameters used to quantize x_float\\n  # qmin and qmax refer to the range of quantized values\\n  x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8)\\n\\n  # QAT: x_fq is still in float\\n  # Fake quantize simulates the numerics of quantize + dequantize\\n  x_fq = (x_float / scale + zp).round().clamp(qmin, qmax)\\n  x_fq = (x_fq - zp) * scale\\n\\nQAT typically involves applying a transformation to your model before and after training.\\nFor example, in the `torchao QAT implementation <https://github.com/pytorch/ao/blob/v0.2.0/torchao/quantization/prototype/qat.py>`_,\\nthese are represented as the ``prepare()`` and ``convert()`` steps: (1) ``prepare()`` inserts fake quantize\\noperations into linear layers, and (2) ``convert()`` transforms the fake quantize operations\\nto actual quantize and dequantize operations after training, thereby producing a quantized\\nmodel (dequantize operations are typically fused with linear after lowering).\\nBetween these two steps, training can proceed exactly as before.\\n\\n.. image:: /_static/img/qat_diagram.png\\n\\n.. _apply_qat_label:\\n\\nApplying QAT to Llama3 models\\n-----------------------------\\n\\nWe can easily apply the above QAT transformations to Llama3 for fine-tuning,\\nleveraging the APIs in torchao as follows:\\n\\n.. 
code-block:: python\\n\\n  import copy\\n  import torch\\n  from torchao.quantization import quantize_\\n  from torchao.quantization.qat import (\\n      FakeQuantizeConfig,\\n      IntXQuantizationAwareTrainingConfig,\\n  )\\n  from torchtune.models.llama3 import llama3_8b\\n\\n  model = llama3_8b()\\n  original_model = copy.deepcopy(model)\\n\\n  # Config for int8 dynamic asymmetric per token activations +\\n  # int4 symmetric per group weights, only for linear layers\\n  activation_config = FakeQuantizeConfig(torch.int8, \"per_token\", is_symmetric=False)\\n  weight_config = FakeQuantizeConfig(torch.int4, group_size=32)\\n  qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)\\n\\n  # Prepare the model for quantization-aware fine-tuning.\\n  #\\n  # This step inserts \"fake quantize\" ops that simulate\\n  # quantization numerics during fine-tuning without\\n  # actually casting the activations/weights to lower-bit\\n  # dtypes like in \"real\" quantization.\\n  quantize_(model, qat_config)\\n\\n  prepared_model = model\\n\\nThe model is now ready for QAT fine-tuning! If we print the model we’ll see that\\nall linear layers have been swapped with :code:`FakeQuantizedLinear`, which simulates\\nthe numerics of int8 dynamic asymmetric per token activations + int4 symmetric\\nper group weights:\\n\\n.. code-block:: bash\\n\\n  >>> original_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n.. 
code-block:: bash\\n\\n  >>> prepared_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=4096, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (k_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=1024, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (v_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=1024, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (output_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=4096, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\nAfter fine-tuning, we can convert the model to get an actual quantized model:\\n\\n.. 
code-block:: python\\n\\n  from torchao.quantization.qat import (\\n      FromIntXQuantizationAwareTrainingConfig,\\n  )\\n  from torchao.quantization import (\\n      Int8DynamicActivationInt4WeightConfig,\\n  )\\n\\n  # Fine-tune as before\\n  train_loop(prepared_model)\\n\\n  # Convert the fake quantized model into an actual quantized model\\n  #\\n  # First, we swap `FakeQuantizedLinear` back to `torch.nn.Linear`\\n  # while keeping the QAT fine-tuned weights. Then, we perform standard\\n  # post-training quantization (PTQ), which inserts quantized activation\\n  # and weight tensor subclasses\\n  quantize_(prepared_model, FromIntXQuantizationAwareTrainingConfig())\\n  quantize_(prepared_model, Int8DynamicActivationInt4WeightConfig(group_size=32))\\n\\n  converted_model = prepared_model\\n\\nThe model is now fully quantized to int8 and int4 and ready for inference\\nor generation. If we print the model now, we will see the linear layers\\nare now swapped back to :code:`torch.nn.Linear`, but with quantized tensor\\nactivations and weights:\\n\\n.. code-block:: bash\\n\\n  >>> converted_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (k_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (v_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (output_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n\\nQAT finetuning recipe in torchtune\\n----------------------------------\\n\\nPutting it all together, we can now fine-tune a model using torchtune’s :ref:`QAT recipe<qat_distributed_recipe_label>`.\\nMake sure that you have first downloaded the Llama3 weights and tokenizer by\\nfollowing :ref:`these instructions<download_llama_label>`. In this tutorial,\\nwe use the following settings to demonstrate QAT’s effectiveness in recovering\\nquantization degradation compared to directly quantizing a model fine-tuned\\nwithout QAT. You can copy the default QAT config and make the following\\nmodifications accordingly:\\n\\n.. code-block:: bash\\n\\n  tune cp llama3/8B_qat_full custom_8B_qat_full.yaml\\n\\n.. 
code-block:: yaml\\n\\n  dataset:\\n    _component_: torchtune.datasets.text_completion_dataset\\n    source: allenai/c4\\n    column: text\\n    name: en\\n    split: train\\n\\n  ...\\n\\n  epochs: 1\\n  max_steps_per_epoch: 2000\\n  fake_quant_after_n_steps: 1000\\n\\nBy default, this uses the :code:`torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer`,\\nwhich uses the same fake quantization configurations as the example above.\\n\\nEmpirically, we observed that disabling fake quantization for the first N steps\\nled to better results, presumably because doing so allows the weights to stabilize\\nbefore we start introducing quantization noise to the fine-tuning process.\\nFor this reason, here we disable fake quantization for the first 1000 steps.\\n\\nYou can then use the following command to run fine-tuning with QAT using the above\\nconfig. This workload requires at least 6 GPUs, each with VRAM of at least 80GB.\\nBy default, this uses the int8 dynamic per token activations + int4 grouped per\\nchannel weights quantization configuration as shown above:\\n\\n.. code-block:: bash\\n\\n  tune run --nnodes 1 --nproc_per_node 6 qat_distributed --config custom_8B_qat_full.yaml\\n\\n.. note::\\n\\n  Make sure to point to the location of your Llama3 weights and tokenizer. This can be done\\n  either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n  or by directly modifying the :code:`8B_qat_full.yaml` file. See our :ref:`config_tutorial_label`\\n  for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n\\n  QAT introduces memory and computation overheads compared to regular fine-tuning,\\n  since fake quantization fundamentally involves extra ops and requires cloning\\n  the weights to avoid mutating them when computing the fake quantized values.\\n  In general, we expect around 30% decrease in fine-tuning speed for models like\\n  Llama3-8B. With activation checkpointing, the increase in memory footprint per\\n  GPU is minimal (< 5GB per GPU).\\n\\n\\nQuantizing the QAT model\\n------------------------\\n\\nNote that the QAT recipe above produces an unquantized bfloat16 model. The model\\nstructure is exactly the same as the model produced with regular full fine-tuning\\nwithout QAT, just with different weights. To actually get a quantized model,\\ncopy and make the following modifications to the quantization config:\\n\\n.. code-block:: bash\\n\\n  tune cp quantization custom_quantization.yaml\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.llama3_8b\\n\\n  checkpointer:\\n    _component_: torchtune.training.FullModelMetaCheckpointer\\n    checkpoint_dir: <your QAT checkpoint dir>\\n    checkpoint_files: [ft-model-00001-of-00001.bin]\\n    output_dir: <your QAT checkpoint dir>\\n    model_type: LLAMA3\\n\\n  ...\\n\\n  quantizer:\\n    _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer\\n    groupsize: 256\\n\\nThe following command performs the convert step in the QAT flow, which actually\\nquantizes the float model to a model with quantized weights:\\n\\n.. code-block:: bash\\n\\n  tune run quantize --config custom_quantization.yaml\\n\\n.. note::\\n\\n  Make sure to use the same QAT quantizer you used to fine-tune your model,\\n  otherwise the numerics will be off and the quantized model will perform poorly.\\n\\n.. 
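note::\\n\\n  *Editor note (not part of the original tutorial):* assuming the quantize recipe saves a flat\\n  ``state_dict`` of tensors such as ``ft-model-00001-of-00001-8da4w.bin`` (the filename referenced\\n  in the evaluation config below), a rough sanity check that the weights really were quantized\\n  might look like the following sketch.\\n\\n.. code-block:: python\\n\\n  import collections\\n\\n  import torch\\n\\n  # hypothetical path: point this at the output_dir used in custom_quantization.yaml\\n  ckpt_path = \"<your QAT checkpoint dir>/ft-model-00001-of-00001-8da4w.bin\"\\n  state_dict = torch.load(ckpt_path, map_location=\"cpu\", weights_only=True)\\n\\n  # tally tensor dtypes: quantized weights should show up as torch.int8, with\\n  # floating-point tensors left over for scales and other metadata\\n  dtype_counts = collections.Counter(\\n      str(t.dtype) for t in state_dict.values() if isinstance(t, torch.Tensor)\\n  )\\n  print(dtype_counts)\\n\\n.. 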
_qat_eval_label:\\n\\nEvaluating the quantized model\\n------------------------------\\n\\nNow that we have a quantized model, we can run some evaluations on it and compare the\\nresults against regular fine-tuning without QAT (i.e. post-training quantization).\\nTo achieve this, we use `EleutherAI’s evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_\\nintegrated in torchtune. First, copy the evaluation config and make the following changes:\\n\\n.. code-block:: bash\\n\\n  tune cp eleuther_evaluation custom_eleuther_evaluation.yaml\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.llama3_8b\\n\\n  checkpointer:\\n    _component_: torchtune.training.FullModelTorchTuneCheckpointer\\n    checkpoint_dir: <your quantized model checkpoint dir>\\n    checkpoint_files: [ft-model-00001-of-00001-8da4w.bin]\\n    output_dir: <your quantized model checkpoint dir>\\n    model_type: LLAMA3\\n\\n  ...\\n\\n  tasks: [\"hellaswag\", \"wikitext\"]\\n\\n  quantizer:\\n    _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer\\n    groupsize: 256\\n\\n.. note::\\n\\n  Since we are passing in a quantized model, be sure to use the corresponding\\n  post-training quantizer instead of the QAT quantizer. For example, if you\\n  used the :code:`Int8DynActInt4WeightQATQuantizer` during fine-tuning, you\\n  should specify :code:`Int8DynActInt4WeightQuantizer` in this step. See the\\n  `quantization recipe <https://github.com/pytorch/torchtune/blob/main/recipes/quantize.py>`_\\n  for a full list of supported quantizers.\\n\\nNow run the evaluation recipe:\\n\\n.. code-block:: bash\\n\\n  tune run eleuther_eval --config custom_eleuther_evaluation.yaml\\n\\nThe results should look something like this:\\n\\n.. code-block:: bash\\n\\n  # QAT quantized model evaluation results (int8 activations + int4 weights)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     |Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|-----:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|9.9148|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity|1.5357|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  |0.6189|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            |0.5687|±  |0.0049|\\n  |         |       |none  |     0|acc_norm       |0.7536|±  |0.0043|\\n\\nComparing these results to the model fine-tuned without QAT, we can see that\\nQAT was able to recover a significant portion of the quantization degradations\\nfrom the original unquantized model compared to PTQ. For example, normalized\\naccuracy in the hellaswag task dropped by 2.20% with PTQ but only 0.74% with\\nQAT when compared to the original unquantized model. Similarly, word perplexity\\nin the wikitext task increased by 2.048 with PTQ but only 1.190 with QAT (lower\\nis better).\\n\\n.. code-block:: bash\\n\\n  # PTQ quantized model evaluation results (int8 activations + int4 weights)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     | Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|------:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|10.7735|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity| 1.5598|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  | 0.6413|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            | 0.5481|±  |0.0050|\\n  |         |       |none  |     0|acc_norm       | 0.7390|±  |0.0044|\\n\\n.. 
code-block:: bash\\n\\n  # Float model evaluation results (bfloat16)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     |Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|-----:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|8.7251|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity|1.4994|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  |0.5844|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            |0.5740|±  |0.0049|\\n  |         |       |none  |     0|acc_norm       |0.7610|±  |0.0043|\\n\\nThus, the QAT flow produced a quantized model that outperforms the post-training\\nquantized model. Importantly, the quantized model structure is identical in both\\nflows, and so the model size, memory usage, and all other performance\\ncharacteristics are also the same.\\n\\nNote that although the weights are quantized to int4, the quantized model size\\nfor both the QAT and the PTQ flows are 8.187 GB, while the original float model\\nis 14.958 GB. This is because this quantizer uses int8 to represent the weights\\nas PyTorch does not have native int4 dtype support. A more efficient representation\\nis to pack the int4 weights, which will halve the quantized model size. This is\\nwhat the Int4WeightOnlyQuantizer does, and the corresponding QAT quantizer will\\nbe added in the future.\\n\\nLowering QAT model to device (optional)\\n---------------------------------------\\n\\nOne important motivation for quantizing a model is to be able to run it in resource\\nconstrained environments. You can further lower your QAT Llama3 model to edge devices\\nsuch as smartphones using `executorch <https://github.com/pytorch/executorch/>`_ by\\nfollowing `these instructions <https://github.com/pytorch/executorch/tree/main/examples/models/llama2>`_.\\nFor example, the following command lowers the model to the XNNPACK backend:\\n\\n.. code-block:: bash\\n\\n  python -m examples.models.llama2.export_llama --checkpoint <your QAT checkpoint> -p <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 256 -d fp32 --metadata \\'{\"get_bos_id\":128000, \"get_eos_id\":128001}\\' --embedding-quantize 4,32 --output_name=\"llama3_8da4w.pte\"\\n\\nThis results in a much smaller quantized model of size 3.881 GB. When benchmarked on a OnePlus 12 smartphone, this model also achieved the same inference and generation speeds as the post-training quantized model. This is because the model structures are the same across the two flows:\\n\\n.. list-table::\\n   :widths: 25 25 25\\n   :header-rows: 1\\n\\n   * -\\n     - QAT\\n     - PTQ\\n   * - Quantized model size\\n     - 3.881 GB\\n     - 3.881 GB\\n   * - Inference speed\\n     - 9.709 tok/s\\n     - 9.815 tok/s\\n   * - Generation speed\\n     - 11.316 tok/s\\n     - 11.364 tok/s\\n\\n.. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network\\'s remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer\\'s self-attention.\\n\\n.. note::\\n\\n    If you\\'re unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html>`_,\\nyou can expect to see further memory savings from the optimizer state.\\n\\n.. note::\\n\\n    LoRA memory savings come primarily from gradient and optimizer states,\\n    so if your model\\'s peak memory comes in its :code:`forward()` method, then LoRA\\n    may not reduce peak memory.\\n\\nHow does LoRA work?\\n-------------------\\n\\nLoRA replaces weight update matrices with a low-rank approximation. In general, weight updates\\nfor an arbitrary :code:`nn.Linear(in_dim,out_dim)` layer could have rank as high as\\n:code:`min(in_dim,out_dim)`. LoRA (and other related papers such as `Aghajanyan et al. <https://arxiv.org/abs/2012.13255>`_)\\nhypothesize that the `intrinsic dimension <https://en.wikipedia.org/wiki/Intrinsic_dimension>`_\\nof these updates during LLM fine-tuning can in fact be much lower.\\nTo take advantage of this property, LoRA finetuning will freeze the original model,\\nthen add a trainable weight update from a low-rank projection. More explicitly, LoRA trains two\\nmatrices :code:`A` and :code:`B`. :code:`A` projects the inputs down to a much smaller rank (often four or eight in practice), and\\n:code:`B` projects back up to the dimension output by the original linear layer.\\n\\nThe image below gives a simplified representation of a single weight update step from a full finetune\\n(on the left) compared to a weight update step with LoRA (on the right). The LoRA matrices :code:`A` and :code:`B`\\nserve as an approximation to the full rank weight update in blue.\\n\\n.. image:: /_static/img/lora_diagram.png\\n\\nAlthough LoRA introduces a few extra parameters in the model :code:`forward()`, only the :code:`A` and :code:`B` matrices are trainable.\\nThis means that with a rank :code:`r` LoRA decomposition, the number of gradients we need to store reduces\\nfrom :code:`in_dim*out_dim` to :code:`r*(in_dim+out_dim)`. 
(Remember that in general :code:`r`\\nis much smaller than :code:`in_dim` and :code:`out_dim`.)\\n\\nFor example, in the 7B Llama2\\'s self-attention, :code:`in_dim=out_dim=4096` for the Q, K,\\nand V projections. This means a LoRA decomposition of rank :code:`r=8` will reduce the number of trainable\\nparameters for a given projection from :math:`4096 * 4096 \\\\approx 16.8M` to :math:`8 * 8192 \\\\approx 65K`, a\\nreduction of over 99%.\\n\\nLet\\'s take a look at a minimal implementation of LoRA in native PyTorch.\\n\\n\\n.. code-block:: python\\n\\n  import torch\\n  from torch import nn\\n\\n  class LoRALinear(nn.Module):\\n    def __init__(\\n      self,\\n      in_dim: int,\\n      out_dim: int,\\n      rank: int,\\n      alpha: float,\\n      dropout: float\\n    ):\\n      # Initialize nn.Module before registering any submodules\\n      super().__init__()\\n\\n      # These are the weights from the original pretrained model\\n      self.linear = nn.Linear(in_dim, out_dim, bias=False)\\n\\n      # These are the new LoRA params. In general rank << in_dim, out_dim\\n      self.lora_a = nn.Linear(in_dim, rank, bias=False)\\n      self.lora_b = nn.Linear(rank, out_dim, bias=False)\\n\\n      # Rank and alpha are commonly-tuned hyperparameters\\n      self.rank = rank\\n      self.alpha = alpha\\n\\n      # Most implementations also include some dropout\\n      self.dropout = nn.Dropout(p=dropout)\\n\\n      # The original params are frozen, and only LoRA params are trainable.\\n      self.linear.weight.requires_grad = False\\n      self.lora_a.weight.requires_grad = True\\n      self.lora_b.weight.requires_grad = True\\n\\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\\n      # This would be the output of the original model\\n      frozen_out = self.linear(x)\\n\\n      # lora_a projects inputs down to the much smaller self.rank,\\n      # then lora_b projects back up to the output dimension\\n      lora_out = self.lora_b(self.lora_a(self.dropout(x)))\\n\\n      # Finally, scale by the alpha parameter (normalized by rank)\\n      # and add to the original model\\'s outputs\\n      return frozen_out + (self.alpha / self.rank) * lora_out\\n\\nThere are some other details around initialization which we omit here, but if you\\'d like to know more\\nyou can see our implementation in :class:`~torchtune.modules.peft.LoRALinear`.\\nNow that we understand what LoRA is doing, let\\'s look at how we can apply it to our favorite models.\\n\\nApplying LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n  # Build Llama2 without any LoRA layers\\n  base_model = llama2_7b()\\n\\n  # The default settings for lora_llama2_7b will match those for llama2_7b\\n  # We just need to define which layers we want LoRA applied to.\\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n  # layers outside of the self-attention.\\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
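note::\\n\\n    *Editor note (not part of the original tutorial):* the sketch below shows how the low-rank update\\n    from the simplified ``LoRALinear`` above could be folded back into the frozen weight once training\\n    is done, so that inference needs no extra matmuls. It uses only the toy module defined earlier,\\n    not torchtune APIs.\\n\\n.. code-block:: python\\n\\n  import torch\\n  from torch import nn\\n\\n  @torch.no_grad()\\n  def merge_lora(module: LoRALinear) -> nn.Linear:\\n      # merged weight: W + (alpha / rank) * B @ A\\n      merged = nn.Linear(module.linear.in_features, module.linear.out_features, bias=False)\\n      delta = module.lora_b.weight @ module.lora_a.weight\\n      merged.weight.copy_(module.linear.weight + (module.alpha / module.rank) * delta)\\n      return merged\\n\\n.. 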
note::\\n\\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\\n    See :ref:`below<setting_trainable_params>` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n  # Print the first layer\\'s self-attention in the usual Llama2 model\\n  >>> print(base_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n  # Print the same for Llama2 with LoRA weights\\n  >>> print(lora_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n      (lora_a): Linear(in_features=4096, out_features=8, bias=False)\\n      (lora_b): Linear(in_features=8, out_features=4096, bias=False)\\n    )\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n      (lora_a): Linear(in_features=4096, out_features=8, bias=False)\\n      (lora_b): Linear(in_features=8, out_features=4096, bias=False)\\n    )\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n\\nNotice that our LoRA model\\'s layer contains additional weights in the Q and V projections,\\nas expected. Additionally, inspecting the type of :code:`lora_model` and\\n:code:`base_model`, would show that they are both instances of the same :class:`~torchtune.modules.TransformerDecoder`.\\n(Feel free to verify this for yourself.)\\n\\nWhy does this matter? torchtune makes it easy to load checkpoints for LoRA directly from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n  # Assuming that base_model already has the pretrained Llama2 weights,\\n  # this will directly load them into your LoRA model without any conversion necessary.\\n  lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n    Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n    the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n    :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n  from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n  # Fetch all params from the model that are associated with LoRA.\\n  lora_params = get_adapter_params(lora_model)\\n\\n  # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n  set_trainable_params(lora_model, lora_params)\\n\\n  # Print the total number of parameters\\n  total_params = sum([p.numel() for p in lora_model.parameters()])\\n  trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n  print(\\n    f\"\"\"\\n    {total_params} total params,\\n    {trainable_params} trainable params,\\n    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n    \"\"\"\\n  )\\n\\n  6742609920 total params,\\n  4194304 trainable params,\\n  0.06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\\n\\nLet\\'s run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\\n\\n.. 
code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\\\\n    lora_attn_modules=[\\'q_proj\\',\\'k_proj\\',\\'v_proj\\',\\'output_proj\\'] \\\\\\n    lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\\n\\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\\n\\n.. image:: /_static/img/lora_experiment_loss_curves.png\\n\\n.. note::\\n    The above figure was generated with W&B. You can use torchtune\\'s :class:`~torchtune.training.metric_logging.WandBLogger`\\n    to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\\n    using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\\n\\n.. _lora_tutorial_memory_tradeoff_label:\\n\\nTrading off memory and model performance with LoRA\\n--------------------------------------------------\\n\\nIn the preceding example, we ran LoRA on two devices. But given LoRA\\'s low memory footprint, we can run fine-tuning\\non a single device using most commodity GPUs which support `bfloat16 <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#bfloat16_floating-point_format>`_\\nfloating-point format. This can be done via the command:\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama2/7B_lora_single_device\\n\\nOn a single device, we may need to be more cognizant of our peak memory. Let\\'s run a few experiments\\nto see our peak memory during a finetune. We will experiment along two axes:\\nfirst, which model layers have LoRA applied, and second, the rank of each LoRA layer. (We will scale\\nalpha in parallel to LoRA rank, as discussed above.)\\n\\nTo compare the results of our experiments, we can evaluate our models on `truthfulqa_mc2 <https://github.com/sylinrl/TruthfulQA>`_, a task from\\nthe `TruthfulQA <https://arxiv.org/abs/2109.07958>`_ benchmark for language models. For more details on how to run this and other evaluation tasks\\nwith torchtune\\'s EleutherAI evaluation harness integration, see our :ref:`End-to-End Workflow Tutorial <eval_harness_label>`.\\n\\nPreviously, we only enabled LoRA for the linear layers in each self-attention module, but in fact there are other linear\\nlayers we can apply LoRA to: MLP layers and our model\\'s final output projection. Note that for Llama-2-7B the final output\\nprojection maps to the vocabulary dimension (32000 instead of 4096 as in the other linear layers), so enabling LoRA for this layer will increase\\nour peak memory a bit more than the other layers. We can make the following changes to our config:\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'k_proj\\', \\'v_proj\\', \\'output_proj\\']\\n    apply_lora_to_mlp: True\\n    apply_lora_to_output: True\\n  ...\\n\\n.. note::\\n    All the finetuning runs below use the `llama2/7B_lora_single_device <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama2/7B_lora_single_device.yaml>`_\\n    config, which has a default batch size of 2. Modifying the batch size (or other hyperparameters, e.g. the optimizer) will impact both peak memory\\n    and final evaluation results.\\n\\n.. 
list-table::\\n   :widths: 25 25 25 25 25\\n   :header-rows: 1\\n\\n   * - LoRA Layers\\n     - Rank\\n     - Alpha\\n     - Peak Memory\\n     - Accuracy (truthfulqa_mc2)\\n   * - Q and V only\\n     - 8\\n     - 16\\n     - **15.57 GB**\\n     - 0.475\\n   * - all layers\\n     - 8\\n     - 16\\n     - 15.87 GB\\n     - 0.508\\n   * - Q and V only\\n     - 64\\n     - 128\\n     - 15.86 GB\\n     - 0.504\\n   * - all layers\\n     - 64\\n     - 128\\n     - 17.04 GB\\n     - **0.514**\\n\\nWe can see that our baseline settings give the lowest peak memory, but our evaluation performance is relatively lower.\\nBy enabling LoRA for all linear layers and increasing the rank to 64, we see almost a 4% absolute improvement\\nin our accuracy on this task, but our peak memory also increases by about 1.4GB. These are just a couple simple\\nexperiments; we encourage you to run your own finetunes to find the right tradeoff for your particular setup.\\n\\nAdditionally, if you want to decrease your model\\'s peak memory even further (and still potentially achieve similar\\nmodel quality results), you can check out our :ref:`QLoRA tutorial<qlora_finetune_label>`.\\n'\n",
-       "│   │   )\n",
-       "],\n",
-       "output_message=CompletionMessage(\n",
-       "│   │   content='Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision). The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.',\n",
-       "│   │   role='assistant',\n",
-       "│   │   stop_reason='end_of_turn',\n",
-       "│   │   tool_calls=[]\n",
-       "),\n",
-       "session_id='6910f07f-f8e0-407b-8441-60a90e7b1834',\n",
-       "started_at=datetime.datetime(2025, 3, 22, 19, 29, 16, 883581, tzinfo=TzInfo(UTC)),\n",
-       "steps=[\n",
-       "│   │   InferenceStep(\n",
-       "│   │   │   api_model_response=CompletionMessage(\n",
-       "│   │   │   │   content='Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision). The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.',\n",
-       "│   │   │   │   role='assistant',\n",
-       "│   │   │   │   stop_reason='end_of_turn',\n",
-       "│   │   │   │   tool_calls=[]\n",
-       "│   │   │   ),\n",
-       "│   │   │   step_id='49409ea3-4a4d-4433-aa71-e6e4ec1bb054',\n",
-       "│   │   │   step_type='inference',\n",
-       "│   │   │   turn_id='212541bc-0cfa-4f04-a8a5-25fe2892bc8f',\n",
-       "│   │   │   completed_at=datetime.datetime(2025, 3, 22, 19, 29, 19, 144218, tzinfo=TzInfo(UTC)),\n",
-       "│   │   │   started_at=datetime.datetime(2025, 3, 22, 19, 29, 17, 267803, tzinfo=TzInfo(UTC))\n",
-       "│   │   )\n",
-       "],\n",
-       "turn_id='212541bc-0cfa-4f04-a8a5-25fe2892bc8f',\n",
-       "completed_at=datetime.datetime(2025, 3, 22, 19, 29, 19, 155387, tzinfo=TzInfo(UTC)),\n",
-       "output_attachments=[]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mTurn\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[33minput_messages\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mUserMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'What precision formats does torchtune support?'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'user'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcontext\u001b[0m=\u001b[32m'.. _memory_optimization_overview_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\nMemory Optimization Overview\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\n\\n**Author**: `Salman Mohammadi \u001b[0m\u001b[32m<\u001b[0m\u001b[32mhttps:\u001b[0m\u001b[32m//github.com/SalmanMohammadi>`_\\n\\ntorchtune comes with a host of plug-and-play memory optimization components which give you lots of flexibility\\nto ``tune`` our recipes to your hardware. This page provides a brief glossary of these components and how you might use them.\\nTo make things easy, we\\'ve summarized these components in the following table:\\n\\n.. csv-table:: Memory optimization components\\n :header: \"Component\", \"When to use?\"\\n :widths: auto\\n\\n \":ref:`glossary_precision`\", \"You\\'ll usually want to leave this as its default ``bfloat16``. It uses 2 bytes per model parameter instead of 4 bytes when using ``float32``.\"\\n \":ref:`glossary_act_ckpt`\", \"Use when you\\'re memory constrained and want to use a larger model, batch size or context length. Be aware that it will slow down training speed.\"\\n \":ref:`glossary_act_off`\", \"Similar to activation checkpointing, this can be used when memory constrained, but may decrease training speed. This **should** be used alongside activation checkpointing.\"\\n \":ref:`glossary_grad_accm`\", \"Helpful when memory-constrained to simulate larger batch sizes. Not compatible with optimizer in backward. Use it when you can already fit at least one sample without OOMing, but not enough of them.\"\\n \":ref:`glossary_low_precision_opt`\", \"Use when you want to reduce the size of the optimizer state. This is relevant when training large models and using optimizers with momentum, like Adam. Note that lower precision optimizers may reduce training stability/accuracy.\"\\n \":ref:`glossary_opt_in_bwd`\", \"Use it when you have large gradients and can fit a large enough batch size, since this is not compatible with ``gradient_accumulation_steps``.\"\\n \":ref:`glossary_cpu_offload`\", \"Offloads optimizer states and \u001b[0m\u001b[32m(\u001b[0m\u001b[32moptionally\u001b[0m\u001b[32m)\u001b[0m\u001b[32m gradients to CPU, and performs optimizer steps on CPU. This can be used to significantly reduce GPU memory usage at the cost of CPU RAM and training speed. Prioritize using it only if the other techniques are not enough.\"\\n \":ref:`glossary_lora`\", \"When you want to significantly reduce the number of trainable parameters, saving gradient and optimizer memory during training, and significantly speeding up training. 
This may reduce training accuracy\"\\n \":ref:`glossary_qlora`\", \"When you are training a large model, since quantization will save 1.5 bytes * \u001b[0m\u001b[32m(\u001b[0m\u001b[32m# of model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, at the potential cost of some training speed and accuracy.\"\\n \":ref:`glossary_dora`\", \"a variant of LoRA that may improve model performance at the cost of slightly more memory.\"\\n\\n\\n.. note::\\n\\n In its current state, this tutorial is focused on single-device optimizations. Check in soon as we update this page\\n for the latest memory optimization features for distributed fine-tuning.\\n\\n.. _glossary_precision:\\n\\n\\nModel Precision\\n---------------\\n\\n*What\\'s going on here?*\\n\\nWe use the term \"precision\" to refer to the underlying data type used to represent the model and optimizer parameters.\\nWe support two data types in torchtune:\\n\\n.. note::\\n\\n We recommend diving into Sebastian Raschka\\'s `blogpost on mixed-precision techniques `_\\n for a deeper understanding of concepts around precision and data formats.\\n\\n* ``fp32``, commonly referred to as \"full-precision\", uses 4 bytes per model and optimizer parameter.\\n* ``bfloat16``, referred to as \"half-precision\", uses 2 bytes per model and optimizer parameter - effectively half\\n the memory of ``fp32``, and also improves training speed. Generally, if your hardware supports training with ``bfloat16``,\\n we recommend using it - this is the default setting for our recipes.\\n\\n.. note::\\n\\n Another common paradigm is \"mixed-precision\" training: where model weights are in ``bfloat16`` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mor ``fp16``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and optimizer\\n states are in ``fp32``. Currently, we don\\'t support mixed-precision training in torchtune.\\n\\n*Sounds great! How do I use it?*\\n\\nSimply use the ``dtype`` flag or config entry in all our recipes! For example, to use half-precision training in ``bf16``,\\nset ``\u001b[0m\u001b[32mdtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mbf16\u001b[0m\u001b[32m``.\\n\\n.. _glossary_act_ckpt:\\n\\nActivation Checkpointing\\n------------------------\\n\\n*What\\'s going on here?*\\n\\nThe relevant section in the `PyTorch documentation `_ explains this concept well.\\nTo quote:\\n\\n Activation checkpointing is a technique that trades compute for memory.\\n Instead of keeping tensors needed for backward alive until they are used in\\n gradient computation during backward, forward computation in checkpointed\\n regions omits saving tensors for backward and recomputes them during the backward pass.\\n\\nThis setting is helpful for when you\\'re memory-constrained, especially due to larger batch sizes or longer context lengths.\\nHowever, these savings in memory come at the cost of training speed \u001b[0m\u001b[32m(\u001b[0m\u001b[32mi.e. tokens-per-second\u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\nand in most cases training can slow down quite a bit as a result of this activation recomputation.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation checkpointing, use ``\u001b[0m\u001b[32menable_activation_checkpointing\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_act_off:\\n\\nActivation Offloading\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nYou may have just read about activation checkpointing! 
Similar to checkpointing, offloading is a memory\\nefficiency technique that allows saving GPU VRAM by temporarily moving activations to CPU and bringing\\nthem back when needed in the backward pass.\\n\\nSee `PyTorch autograd hook tutorial `_\\nfor more details about how this is implemented through :func:`torch.autograd.graph.saved_tensors_hooks`.\\n\\nThis setting is especially helpful for larger batch sizes, or longer context lengths when you\\'re memory constrained.\\nWhile of course it takes runtime and resources to move Tensors from GPU to CPU and back, the implementation in\\ntorchtune uses multiple CUDA streams \u001b[0m\u001b[32m(\u001b[0m\u001b[32mwhen available\u001b[0m\u001b[32m)\u001b[0m\u001b[32m in order to overlap the extra communication with the computation\\nto hide the extra runtime. As the communication workload is variable depending on the number and size of tensors being\\noffloaded, we do not recommend using it unless :ref:`glossary_act_ckpt` is also enabled, in which case only the checkpointed\\ntensors will be offloaded.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation offloading, use the ``enable_activation_offloading`` config entry or flag\\nin our lora finetuning single device recipe, e.g. ``\u001b[0m\u001b[32menable_activation_offloading\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``. To allow\\nusage of streams, make sure you are on a torch version equal to or later than PyTorch.\\n\\n.. _glossary_grad_accm:\\n\\nGradient Accumulation\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nGradient accumulation allows you to simulate large batch sizes by *accumulating* gradients over several\\nbatches before updating model parameters using the optimizer. Concretely, the total number of samples used\\nfor a gradient update is when using gradient accumulation is:\\n\\n ``total_batch_size = batch_size * gradient_accumulation_steps``\\n\\nFor example: with ``\u001b[0m\u001b[32mbatch_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1\u001b[0m\u001b[32m`` and ``\u001b[0m\u001b[32mgradient_accumulation_steps\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m`` we get a total batch size of 32.\\n\\n.. note::\\n\\n For other components in torchtune which use \"steps\", such as :ref:`metric logging `, or\\n :func:`learning rate schedulers `, a \"step\" is counted as a\\n single update to model parameters, rather than a single model forward pass with the data.\\n Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``.\\n Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.\\n For this reason, metric logging will appear less frequently when training with gradient accumulation,\\n and progress bars may update more slowly.\\n\\n\\nIf you\\'re using one of our distributed recipes, simply multiply by the number of devices:\\n\\n ``total_batch_size = batch_size * gradient_accumulation_steps * num_devices``\\n\\nGradient accumulation is especially useful when you can fit at least one sample in your GPU. In this case, artificially increasing the batch by\\naccumulating gradients might give you faster training speeds than using other memory optimization techniques that trade-off memory for speed, like :ref:`activation checkpointing `.\\n\\n*Sounds great! How do I use it?*\\n\\nAll of our finetuning recipes support simulating larger batch sizes by accumulating gradients. Just set the\\n``gradient_accumulation_steps`` flag or config entry.\\n\\n.. 
note::\\n\\n Gradient accumulation should always be set to 1 when :ref:`fusing the optimizer step into the backward pass `.\\n\\nOptimizers\\n----------\\n\\n.. _glossary_low_precision_opt:\\n\\nLower Precision Optimizers\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nIn addition to :ref:`reducing model and optimizer precision ` during training, we can further reduce precision in our optimizer states.\\nAll of our recipes support lower-precision optimizers from the `torchao `_ library.\\nFor single device recipes, we also support `bitsandbytes `_.\\n\\nA good place to start might be the :class:`torchao.prototype.low_bit_optim.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers.\\nBoth reduce memory by quantizing the optimizer state dict. Paged optimizers will also offload to CPU if there isn\\'t enough GPU memory available. In practice,\\nyou can expect higher memory savings from bnb\\'s PagedAdamW8bit but higher training speed from torchao\\'s AdamW8bit.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this in your recipes, make sure you have installed torchao \u001b[0m\u001b[32m(\u001b[0m\u001b[32m``pip install torchao``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m or bitsandbytes \u001b[0m\u001b[32m(\u001b[0m\u001b[32m``pip install bitsandbytes``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. Then, enable\\na low precision optimizer using the :ref:`cli_label`:\\n\\n\\n.. code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorchao\u001b[0m\u001b[32m.prototype.low_bit_optim.AdamW8bit\\n\\n.. code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mbitsandbytes\u001b[0m\u001b[32m.optim.PagedAdamW8bit\\n\\nor by directly :ref:`modifying a config file`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: bitsandbytes.optim.PagedAdamW8bit\\n lr: 2e-5\\n\\n.. _glossary_opt_in_bwd:\\n\\nFusing Optimizer Step into Backward Pass\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nStateful optimizers \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. optimizers which use momentum\u001b[0m\u001b[32m)\u001b[0m\u001b[32m are the default in modern deep learning due to their stable convergence properties.\\nHowever, maintaining a state of gradient statistics comes at the cost of additional memory usage. An immediate alternative might be to\\nturn to stateless optimizers such as `stochastic gradient descent `_\\nwithout momentum, which don\\'t require any additional memory usage, but will likely result in worse convergence during training.\\n\\nCan we find a middle ground here? Let\\'s consider a technique which enables the use of \"stateful\" optimizers such as `AdamW `_\\nwithout the memory overhead of gradient statistics, and without sacrificing their desirable convergence properties.\\nHow is this possible, you might ask? By *completely removing the buffer of gradients* which are stored by the optimizer during its ``step\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m``.\\n\\nTo understand how this works, we encourage you to read through the relevant PyTorch tutorial on this concept:\\n`How to save memory by fusing the optimizer step into the backward pass `_.\\n\\n\\n*Sounds great! How do I use it?*\\n\\n.. todo ref full finetune recipe doc\\n\\nIn torchtune, you can enable this feature using the ``optimizer_in_bwd`` flag. 
This feature works best when using a stateful optimizer\\nwith a model with a lot of parameters, and when you don\\'t need to use :ref:`gradient accumulation `.\\nYou won\\'t see meaningful impact when finetuning LoRA recipes, since in this case the number of parameters being updated are small.\\n\\n.. _glossary_cpu_offload:\\n\\nOffloading Optimizer/Gradient states to CPU\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nWe\\'ve mentioned above the concept of optimizer states - memory used by the stateful optimizers to maintain a state of gradient statistics, and\\nmodel gradients - tensors used to store gradients when we perform model backwards passes. We support using CPU offloading in our single-device recipes\\nthrough the `CPUOffloadOptimizer `_ from ``torchao``.\\n\\nThis optimizer can wrap any base optimizer and works by keeping the optimizer states and performing the optimizer step on CPU, thus reducing\\nGPU memory usage by the size of the optimizer states. Additionally, we can also offload gradients to the CPU by using `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\nIf finetuning on a single-device, another option is to use the ``PagedAdamW8bit`` from bitsandbytes, mentioned :ref:`above `, which will *only* offload to CPU\\nwhen there is not enough GPU available.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this optimizer in your recipes, set the ``optimizer`` key in your config to :class:`torchao.prototype.low_bit_optim.CPUOffloadOptimizer`, which\\nwill use the :class:`torch.optim.AdamW` optimizer with ``\u001b[0m\u001b[32mfused\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n optimizer.\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4e\u001b[0m\u001b[32m-5\\n\\n\\nor by directly :ref:`modifying a config file`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n offload_gradients: True\\n # additional key-word arguments can be passed to torch.optim.AdamW\\n lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, # your model here\\n Adam,\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1e\u001b[0m\u001b[32m-5,\\n \u001b[0m\u001b[32mfused\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page `_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. 
To minimize the slowdown, it is recommended to \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m give GPU more work per optimizer step to amortize the offloading time \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. larger batch size with activation checkpointing, gradient accumulation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n* Gradient accumulation should always be set to 1 when ``\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``\u001b[0m\u001b[32mfsdp_cpu_offload\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 `_ to see how they differ.\\n\\n\\n.. _glossary_peft:\\n\\nParameter Efficient Fine-Tuning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPEFT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n--------------------------------------\\n\\n.. _glossary_lora:\\n\\nLow Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n*What\\'s going on here?*\\n\\nYou can read our tutorial on :ref:`finetuning Llama2 with LoRA` to understand how LoRA works, and how to use it.\\nSimply stated, LoRA greatly reduces the number of trainable parameters, thus saving significant gradient and optimizer\\nmemory during training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using any of our recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device`. These recipes utilize\\nLoRA-enabled model builders, which we support for all our models, and also use the ``lora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3` model has a corresponding :func:`torchtune.models.llama3.lora_llama3`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. 
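As a quick preview before going through each parameter, here is a rough sketch of what these choices look like when building the model directly in Python (assuming the LoRA model builders accept the keyword arguments below, mirroring the ``model`` config entries discussed next):

.. code-block:: python

    # Illustrative only: construct a LoRA-enabled Llama3 8B model in Python.
    # The keyword arguments are assumed to mirror the config entries described
    # in the rest of this section.
    from torchtune.models.llama3 import lora_llama3_8b

    model = lora_llama3_8b(
        lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"],
        apply_lora_to_mlp=True,
        lora_rank=32,
        lora_alpha=64,
    )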
Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List\u001b[0m\u001b[32m[\u001b[0m\u001b[32mstr\u001b[0m\u001b[32m]\u001b[0m\u001b[32m`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. in language models\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings \u001b[0m\u001b[32m(\u001b[0m\u001b[32msuch as Gemma and Qwen2 1.5B and 0.5B\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of the LoRA decomposition, where ``lora_rank << in_dim`` and ``lora_rank << out_dim``\\n \\\\- the dimensions of an arbitrary linear layer in the model. Concretely, ``lora_rank`` reduces the number of gradients stored\\n in a linear fashion from ``in_dim * out_dim`` to ``lora_rank * \u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim + out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m``. Typically, we have ``lora_rank in \u001b[0m\u001b[32m[\u001b[0m\u001b[32m8, 256\u001b[0m\u001b[32m]\u001b[0m\u001b[32m``.\\n* ``lora_alpha: float`` affects the magnitude of the LoRA updates. A larger alpha results in larger updates to the base model weights\\n , potentially at the cost of training stability, conversely, smaller alpha can stabilize training at the cost of slower learning.\\n We provide default settings for these parameters which we\\'ve tested with all of our models, but we encourage you to adjust them\\n to your specific use case. Typically, one jointly changes ``lora_rank`` and ``lora_alpha`` together, where ``lora_alpha ~= 2*lora_rank``.\\n* ``lora_dropout`` introduces dropout in the LoRA layers to help regularize training. 
We default to 0.0 for all of our models.\\n\\nAs above, these parameters are also specified under the ``model`` flag or config entry:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m64\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 32\\n lora_alpha: 64\\n\\n.. note::\\n\\n To get a deeper sense of how LoRA parameters affect memory usage during training,\\n see the :ref:`relevant section in our Llama2 LoRA tutorial`.\\n\\n.. _glossary_qlora:\\n\\nQuantized Low Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQLoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`QLoRA `_ is a memory enhancement on top of `LoRA `_\\nthat maintains the frozen model parameters from LoRA in 4-bit quantized precision, thereby reducing memory usage.\\nThis is enabled through a novel 4-bit NormalFloat \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNF4\u001b[0m\u001b[32m)\u001b[0m\u001b[32m data type proposed by the authors, which allows for 4-8x less\\nparameter memory usage whilst retaining model accuracy. You can read our tutorial on :ref:`finetuning Llama2 with QLoRA`\\nfor a deeper understanding of how it works.\\n\\nWhen considering using QLoRA to reduce memory usage, it\\'s worth noting that QLoRA is slower than LoRA and may not be worth it if\\nthe model you are finetuning is small. In numbers, QLoRA saves roughly 1.5 bytes * \u001b[0m\u001b[32m(\u001b[0m\u001b[32m# of model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. Also, although QLoRA quantizes the model,\\nit minimizes accuracy degradation by up-casting quantized parameters to the original higher precision datatype during model forward passes - this up-casting may incur penalties to training speed.\\nThe :ref:`relevant section ` in our QLoRA tutorial demonstrates the usage of ``torch.compile`` to address this by speeding up training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using QLoRA with any of our LoRA recipes, i.e. recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device`. These recipes utilize\\nQLoRA-enabled model builders, which we support for all our models, and also use the ``qlora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3_8b` model has a corresponding :func:`torchtune.models.llama3.qlora_llama3_8b`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with QLoRA quickly,\\njust specify any config with ``_qlora`` in its name.\\n\\nAll the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA `\\nto see how to configure these parameters.\\n\\nTo configure from the command line:\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m64\u001b[0m\u001b[32m\\n\\n\\nor, by modifying a config:\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.qlora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 32\\n lora_alpha: 64\\n\\n.. _glossary_dora:\\n\\nWeight-Decomposed Low-Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`DoRA `_ is another PEFT technique which builds on-top of LoRA by\\nfurther decomposing the pre-trained weights into two components: magnitude and direction. The magnitude component\\nis a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA decomposition and\\nupdates the orientation of weights.\\n\\nDoRA adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to\\nimprove the performance of LoRA, particularly at low ranks.\\n\\n*Sounds great! How do I use it?*\\n\\nMuch like LoRA and QLoRA, you can finetune using DoRA with any of our LoRA recipes. We use the same model builders for LoRA\\nas we do for DoRA, so you can use the ``lora_`` version of any model builder with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``. For example, to finetune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``\u001b[0m\u001b[32mquantize\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` to reap\\neven more memory savings!\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m16\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mquantize_base\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFSDP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n.. _chat_tutorial_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m=================================\\nFine-Tuning Llama3 with Chat Data\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m=================================\\n\\nLlama3 Instruct introduced a new prompt template for fine-tuning with chat data. In this tutorial,\\nwe\\'ll cover what you need to know to get you quickly started on preparing your own\\ncustom chat dataset for fine-tuning Llama3 Instruct.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn:\\n\\n * How the Llama3 Instruct format differs from Llama2\\n * All about prompt templates and special tokens\\n * How to use your own chat dataset to fine-tune Llama3 Instruct\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`configuring datasets`\\n * Know how to :ref:`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you\\'ll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mINST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m/INST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `_\\nthe template from Llama2 to better support multiturn conversations. The same text\\nin the Llama3 Instruct format would look like this:\\n\\n.. code-block:: text\\n\\n <|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"system\",\\n \"content\": \"You are a helpful, respectful, and honest assistant.\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"user\",\\n \"content\": \"Who are the most influential hip-hop artists of all time?\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"assistant\",\\n \"content\": \"Here is a list of some of the most influential hip-hop \"\\n \"artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nNow, let\\'s format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and\\nsee how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**,\\nwhich simply structures a prompt with flavor text to indicate a certain task.\\n\\n.. 
code-block:: python\\n\\n from torchtune.data import Llama2ChatTemplate, Message\\n\\n messages = \u001b[0m\u001b[32m[\u001b[0m\u001b[32mMessage.from_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmsg\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for msg in sample\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n formatted_messages = Llama2ChatTemplate.format\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmessages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mformatted_messages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n # Message\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32mrole\u001b[0m\u001b[32m=\\'user\\',\\n # \u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m=\\'\u001b[0m\u001b[32m[\u001b[0m\u001b[32mINST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m <>\\\\nYou are a helpful, respectful, and honest assistant.\\\\n<>\\\\n\\\\nWho are the most influential hip-hop artists of all time? \u001b[0m\u001b[32m[\u001b[0m\u001b[32m/INST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\',\\n # ...,\\n # \u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\n # Message\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32mrole\u001b[0m\u001b[32m=\\'assistant\\',\\n # \u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m=\\'Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\\',\\n # ...,\\n # \u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\n # \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThere are also special tokens used by Llama2, which are not in the prompt template.\\nIf you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you\\'ll notice that\\nwe don\\'t include the :code:`` and :code:`` tokens. These are the beginning-of-sequence\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mBOS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and end-of-sequence \u001b[0m\u001b[32m(\u001b[0m\u001b[32mEOS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m tokens that are represented differently in the tokenizer\\nthan the rest of the prompt template. Let\\'s tokenize this example with the\\n:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see\\nwhy.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_tokenizer\\n\\n tokenizer = llama2_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Llama-2-7b-hf/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n user_message = formatted_messages\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.text_content\\n tokens = tokenizer.encode\u001b[0m\u001b[32m(\u001b[0m\u001b[32muser_message, \u001b[0m\u001b[32madd_bos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m, \u001b[0m\u001b[32madd_eos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokens\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32m[\u001b[0m\u001b[32m1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nWe\\'ve added the BOS and EOS tokens when encoding our example text. This shows up\\nas IDs 1 and 2. We can verify that these are our BOS and EOS tokens.\\n\\n.. 
code-block:: python

    print(tokenizer._spm_model.spm_model.piece_to_id("<s>"))
    # 1
    print(tokenizer._spm_model.spm_model.piece_to_id("</s>"))
    # 2

The BOS and EOS tokens are what we call special tokens, because they have their own
reserved token IDs. This means that they will index to their own individual vectors in
the model's learnt embedding table. The rest of the prompt template tags, :code:`[INST]`
and :code:`<<SYS>>`, are tokenized as normal text and not their own IDs.

.. code-block:: python

    print(tokenizer.decode(518))
    # '['
    print(tokenizer.decode(25580))
    # 'INST'
    print(tokenizer.decode(29962))
    # ']'
    print(tokenizer.decode([3532, 14816, 29903, 6778]))
    # '<<SYS>>'

It's important to note that you should not place the special reserved tokens in your
input prompts manually, as it will be treated as normal text and not as a special
token.

.. code-block:: python

    print(tokenizer.encode("<s>", add_bos=False, add_eos=False))
    # [529, 29879, 29958]

Now let's take a look at Llama3's formatting to see how it's tokenized differently
than Llama2.

..
code-block:: python\\n\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n messages = \u001b[0m\u001b[32m[\u001b[0m\u001b[32mMessage.from_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmsg\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for msg in sample\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n tokens, mask = tokenizer.tokenize_messages\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmessages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokens\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'<|start_header_id|>system<|end_header_id|>\\\\n\\\\nYou are a helpful, respectful,\\n # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\\\n\\\\nWho\\n # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|>\\n # assistant<|end_header_id|>\\\\n\\\\nHere is a list of some of the most influential hip-hop\\n # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>\\'\\n\\n.. note::\\n We used the ``tokenize_messages`` API for Llama3, which is different than\\n encode. It simply manages adding all the special tokens in the correct\\n places after encoding the individual messages.\\n\\nWe can see that the tokenizer handled all the formatting without us specifying a prompt\\ntemplate. It turns out that all of the additional tags are special tokens, and we don\\'t require\\na separate prompt template. We can verify this by checking if the tags get encoded\\nas their own token IDs.\\n\\n.. code-block:: python\\n\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.special_tokens\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"<|begin_of_text|>\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 128000\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.special_tokens\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"<|eot_id|>\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 128009\\n\\nThe best part is - all these special tokens are handled purely by the tokenizer.\\nThat means you won\\'t have to worry about messing up any required prompt templates!\\n\\n\\nWhen should I use a prompt template?\\n------------------------------------\\n\\nWhether or not to use a prompt template is governed by what your desired inference\\nbehavior is. You should use a prompt template if you are running inference on the\\nbase model and it was pre-trained with a prompt template, or you want to prime a\\nfine-tuned model to expect a certain prompt structure on inference for a specific task.\\n\\nIt is not strictly necessary to fine-tune with a prompt template, but generally\\nspecific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate`\\nprovides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text.\\nThis would wrap around the user message, with the assistant message untouched.\\n\\n.. code-block:: python\\n\\n f\"Summarize this dialogue:\\\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mdialogue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\\\n---\\\\nSummary:\\\\n\"\\n\\nYou can fine-tune Llama2 with this template even though the model was originally pre-trained\\nwith the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model\\nsees during inference. 
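To make the structure concrete, here is a toy illustration (not torchtune's actual :class:`~torchtune.data.SummarizeTemplate` implementation) of what such a template does: only the user message is wrapped, and the assistant message is left untouched.

.. code-block:: python

    # Toy sketch of a summarization prompt template: the user content is
    # rewritten into the task structure; the assistant (target) message is
    # left as-is.
    def summarize_template(user_content: str) -> str:
        return f"Summarize this dialogue:\n{user_content}\n---\nSummary:\n"

    print(summarize_template("A: Hi, how was your day? B: Pretty good, thanks!"))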
The model should be robust enough to adapt to a new template.\\n\\n\\nFine-tuning on a custom chat dataset\\n------------------------------------\\n\\nLet\\'s test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom\\nchat dataset. We\\'ll walk through how to set up our data so that it can be tokenized\\ncorrectly and fed into our model.\\n\\nLet\\'s say we have a local dataset saved as a JSON file that contains conversations\\nwith an AI model. How can we get something like this into a format\\nLlama3 understands and tokenizes correctly?\\n\\n.. code-block:: python\\n\\n # data/my_data.json\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"dialogue\": \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"human\",\\n \"value\": \"What is your name?\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"gpt\",\\n \"value\": \"I am an AI assistant, I don\\'t have a name.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"human\",\\n \"value\": \"Pretend you have a name.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"gpt\",\\n \"value\": \"My name is Mark Zuckerberg.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nLet\\'s first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we\\nhave conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n ds = chat_dataset\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32msource\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"json\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mdata_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"data\u001b[0m\u001b[32m/my_data.json\",\\n \u001b[0m\u001b[32msplit\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"train\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_column\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"dialogue\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_style\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"sharegpt\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default \u001b[0m\u001b[32m(\u001b[0m\u001b[32m:class:`~torchtune.models.mistral.MistralChatTemplate`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to format\\nall messages according to their `recommendations `_.\\n\\nNow we\\'re ready to start fine-tuning! We\\'ll use the built-in LoRA single device recipe.\\nUse the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml`\\nconfig and update it with your dataset configuration.\\n\\nLaunch the fine-tune!\\n\\n.. code-block:: bash\\n\\n $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml \u001b[0m\u001b[32mepochs\u001b[0m\u001b[32m=\u001b[0m\u001b[32m15\u001b[0m\u001b[32m\\n\\n.. _llama3_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m========================\\nMeta Llama3 in torchtune\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m========================\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to:\\n\\n * Download the Llama3-8B-Instruct weights and tokenizer\\n * Fine-tune Llama3-8B-Instruct with LoRA and QLoRA\\n * Evaluate your fine-tuned Llama3-8B-Instruct model\\n * Generate text with your fine-tuned model\\n * Quantize your model to speed up generation\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n\\n\\nLlama3-8B\\n---------\\n\\n`Meta Llama 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size \u001b[0m\u001b[32m(\u001b[0m\u001b[32m128,256 instead of 32,000 from Llama2 models\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n- Llama3-8B uses a different tokenizer than Llama2 models \u001b[0m\u001b[32m(\u001b[0m\u001b[32m`tiktoken `_ instead of `sentencepiece `_\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. 
First, let\\'s download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3-8B-Instruct \\\\\\n --output-dir \\\\\\n --hf-token \\n\\n|\\n\\nFine-tuning Llama3-8B-Instruct in torchtune\\n-------------------------------------------\\n\\ntorchtune provides `LoRA `_, `QLoRA `_, and full fine-tuning\\nrecipes for fine-tuning Llama3-8B on one or more GPUs. For more on LoRA in torchtune, see our :ref:`LoRA Tutorial `.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.\u001b[0m\u001b[32mcheckpoint_dir\u001b[0m\u001b[32m= \\\\\\n tokenizer.\u001b[0m\u001b[32mpath\u001b[0m\u001b[32m=/tokenizer.model \\\\\\n checkpointer.\u001b[0m\u001b[32moutput_dir\u001b[0m\u001b[32m=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand other\u001b[0m\u001b[32m)\u001b[0m\u001b[32m configs we can use :ref:`tune cp ` to copy \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand modify\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmuch smaller\u001b[0m\u001b[32m)\u001b[0m\u001b[32m LoRA weights\\nwill be saved separately.\\n\\nIn our experiments, we observed a peak memory usage of 18.5 GB. The default config can be trained on a consumer GPU with 24 GB VRAM.\\n\\nIf you have multiple GPUs available, you can run the distributed version of the recipe.\\ntorchtune makes use of the `FSDP `_ APIs from PyTorch Distributed\\nto shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster overall training.\\nFor example, on two devices:\\n\\n.. 
code-block:: bash\\n\\n tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora\\n\\nFinally, if we want to use even less memory, we can leverage torchtune\\'s QLoRA recipe via:\\n\\n.. TODO \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSalmanMohammadi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ref qlora recipe page\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_qlora_single_device\\n\\nSince our default configs enable full bfloat16 training, all of the above commands can be run with\\ndevices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory\\nbelow 10 GB. You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.\\nTry it out!\\n\\n|\\n\\nEvaluating fine-tuned Llama3-8B models with EleutherAI\\'s Eval Harness\\n---------------------------------------------------------------------\\n\\nNow that we\\'ve fine-tuned our model, what\\'s next? Let\\'s take our LoRA-finetuned model from the\\npreceding section and look at a couple different ways we can evaluate its performance on the tasks we care about.\\n\\nFirst, torchtune provides an integration with\\n`EleutherAI\\'s evaluation harness `_\\nfor model evaluation on common benchmark tasks.\\n\\n.. note::\\n Make sure you\\'ve first installed the evaluation harness via :code:`pip install \"\u001b[0m\u001b[32mlm_eval\u001b[0m\u001b[32m==0.4.*\"`.\\n\\nFor this tutorial we\\'ll use the `truthfulqa_mc2 `_ task from the harness.\\nThis task measures a model\\'s propensity to be truthful when answering questions and\\nmeasures the model\\'s zero-shot accuracy on a question followed by one or more true\\nresponses and one or more false responses. First, let\\'s copy the config so we can point the YAML\\nfile to our fine-tuned checkpoint files.\\n\\n.. code-block:: bash\\n\\n tune cp eleuther_evaluation ./custom_eval_config.yaml\\n\\nNext, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n # directory with the checkpoint files\\n # this should match the output_dir specified during\\n # fine-tuning\\n checkpoint_dir: \\n\\n # checkpoint files for the fine-tuned model. These will be logged\\n # at the end of your fine-tune\\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n meta_model_0.pt\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n output_dir: \\n model_type: LLAMA3\\n\\n # Make sure to update the tokenizer path to the right\\n # checkpoint directory as well\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tokenizer.model\\n\\nFinally, we can run evaluation using our modified config.\\n\\n.. code-block:: bash\\n\\n tune run eleuther_eval --config ./custom_eval_config.yaml\\n\\nTry it for yourself and see what accuracy your model gets!\\n\\n|\\n\\nGenerating text with our fine-tuned Llama3 model\\n------------------------------------------------\\n\\n.. TODO \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSalmanMohammadi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ref generate recipe page\\n\\nNext, let\\'s look at one other way we can evaluate our model: generating text! torchtune provides a\\n`recipe for generation `_ as well.\\n\\nSimilar to what we did, let\\'s copy and modify the default generation config.\\n\\n.. 
code-block:: bash\\n\\n tune cp generation ./custom_generation_config.yaml\\n\\nNow we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer.\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n # directory with the checkpoint files\\n # this should match the output_dir specified during\\n # fine-tuning\\n checkpoint_dir: \\n\\n # checkpoint files for the fine-tuned model. These will be logged\\n # at the end of your fine-tune\\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n meta_model_0.pt\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n output_dir: \\n model_type: LLAMA3\\n\\n # Make sure to update the tokenizer path to the right\\n # checkpoint directory as well\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tokenizer.model\\n\\nRunning generation with our LoRA-finetuned model, we see the following output:\\n\\n.. code-block:: bash\\n\\n tune run generate --config ./custom_generation_config.yaml \\\\\\n prompt.\u001b[0m\u001b[32muser\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"Hello\u001b[0m\u001b[32m, my name is\"\\n\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:122\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.\\n ...\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:135\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Time for inference: 10.88 sec total, 18.94 tokens/sec\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:138\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Bandwidth achieved: 346.09 GB/s\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:139\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Memory used: 18.31 GB\\n\\nFaster generation via quantization\\n----------------------------------\\n\\nWe rely on `torchao `_ for `post-training quantization `_.\\nTo quantize the fine-tuned model after installing torchao we can run the following command::\\n\\n # we also support `int8_weight_only\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and `int8_dynamic_activation_int8_weight\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`, see\\n # https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques\\n # for a full list of techniques that we support\\n from torchao.quantization.quant_api import quantize_, int4_weight_only\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmodel, int4_weight_only\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nAfter quantization, we rely on torch.compile for speedups. For more details, please see `this example usage `_.\\n\\ntorchao also provides `this table `_ listing performance and accuracy results for ``llama2`` and ``llama3``.\\n\\nFor Llama models, you can run generation directly in torchao on the quantized model using their ``generate.py`` script as\\ndiscussed in `this readme `_. This way you can compare your own results\\nto those in the previously-linked table.\\n\\n\\nThis is just the beginning of what you can do with Meta Llama3 using torchtune and the broader ecosystem.\\nWe look forward to seeing what you build!\\n\\n404: Not Found\\n.. 
_qat_finetune_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m===========================\\nFine-Tuning Llama3 with QAT\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m===========================\\n\\nQuantization-Aware Training \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQAT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is a common technique for users to quantize their\\nmodels without incurring significant degradations in accuracy or perplexity. In this\\ntutorial, we’ll walk through how to apply QAT during fine-tuning, quantize the\\nresulting model, and evaluate your quantized model using torchtune.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What QAT is and how it helps reduce quantization degradation\\n * How to run QAT during fine-tuning in torchtune\\n * End-to-end example of connecting QAT, quantization, and evaluation recipes\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama3-8B model weights`\\n\\n.. _what_is_qat_label:\\n\\nWhat is QAT?\\n------------\\n\\n`Quantization-Aware Training `_ \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQAT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m refers to simulating quantization numerics during\\ntraining or fine-tuning, with the end goal of ultimately producing a higher quality\\nquantized model compared to simple post-training quantization \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPTQ\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. During QAT,\\nthe weights and/or activations are “fake quantized”, meaning they are transformed\\nas if they were being quantized, but kept in the original data type \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. bfloat16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nwithout being actually cast to lower bit-widths. Thus, fake quantization allows the\\nmodel to adjust for quantization noise when updating the weights, hence the training\\nprocess is “aware” that the model will ultimately be quantized after training.\\n\\n.. 
code-block:: python

    # PTQ: x_q is quantized and cast to int8
    # scale and zero point (zp) refer to parameters used to quantize x_float
    # qmin and qmax refer to the range of quantized values
    x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8)

    # QAT: x_fq is still in float
    # Fake quantize simulates the numerics of quantize + dequantize
    x_fq = (x_float / scale + zp).round().clamp(qmin, qmax)
    x_fq = (x_fq - zp) * scale

QAT typically involves applying a transformation to your model before and after training.
For example, in the `torchao QAT implementation `_,
these are represented as the ``prepare()`` and ``convert()`` steps: (1) ``prepare()`` inserts fake quantize
operations into linear layers, and (2) ``convert()`` transforms the fake quantize operations
to actual quantize and dequantize operations after training, thereby producing a quantized
model (dequantize operations are typically fused with linear after lowering).
Between these two steps, training can proceed exactly as before.

.. image:: /_static/img/qat_diagram.png

.. _apply_qat_label:

Applying QAT to Llama3 models
-----------------------------

We can easily apply the above QAT transformations to Llama3 for fine-tuning,
leveraging the APIs in torchao as follows:

.. code-block:: python

    import copy
    import torch
    from torchao.quantization import quantize_
    from torchao.quantization.qat import (
        FakeQuantizeConfig,
        IntXQuantizationAwareTrainingConfig,
    )
    from torchtune.models.llama3 import llama3_8b

    model = llama3_8b()
    original_model = copy.deepcopy(model)

    # Config for int8 dynamic asymmetric per token activations +
    # int4 symmetric per group weights, only for linear layers
    activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
    weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
    qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)

    # Prepare the model for quantization-aware fine-tuning.
    #
    # This step inserts "fake quantize" ops that simulate
    # quantization numerics during fine-tuning without
    # actually casting the activations/weights to lower-bit
    # dtypes like in "real" quantization.
    quantize_(model, qat_config)

    prepared_model = model

The model is now ready for QAT fine-tuning! If we print the model we'll see that
all linear layers have been swapped with :code:`FakeQuantizedLinear`, which simulates
the numerics of int8 dynamic asymmetric per token activations + int4 symmetric
per group weights:

.. code-block:: bash

    >>> original_model.layers[0].attn
    MultiHeadAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (pos_embeddings): RotaryPositionalEmbeddings()
    )

.. code-block:: bash

    >>> prepared_model.layers[0].attn
    MultiHeadAttention(
      (q_proj): FakeQuantizedLinear(
        in_features=4096, out_features=4096, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
      )
      (k_proj): FakeQuantizedLinear(
        in_features=4096, out_features=1024, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
      )
      (v_proj): FakeQuantizedLinear(
        in_features=4096, out_features=1024, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
      )
      (output_proj): FakeQuantizedLinear(
        in_features=4096, out_features=4096, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=, is_dynamic=True, range_learning=False))
      )
      (pos_embeddings): RotaryPositionalEmbeddings()
\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nAfter fine-tuning, we can convert the model to get an actual quantized model:\\n\\n.. code-block:: python\\n\\n from torchao.quantization.qat import \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n FromIntXQuantizationAwareTrainingConfig,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n from torchao.quantization import \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n Int8DynamicActivationInt4WeightConfig,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Fine-tune as before\\n train_loop\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Convert the fake quantized model into an actual quantized model\\n #\\n # First, we swap `FakeQuantizedLinear` back to `torch.nn.Linear`\\n # while keeping the QAT fine-tuned weights. Then, we perform standard\\n # post-training quantization \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPTQ\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, which inserts quantized activation\\n # and weight tensor subclasses\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model, FromIntXQuantizationAwareTrainingConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model, Int8DynamicActivationInt4WeightConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32mgroup_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n converted_model = prepared_model\\n\\nThe model is now fully quantized to int8 and int4 and ready for inference\\nor generation. If we print the model now, we will see the linear layers\\nare now swapped back to :code:`torch.nn.Linear`, but with quantized tensor\\nactivations and weights:\\n\\n.. code-block:: bash\\n\\n >>> converted_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4096, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: 
Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1024\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1024, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1024\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1024, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, 
\u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4096, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n\\nQAT finetuning recipe in torchtune\\n----------------------------------\\n\\nPutting it all together, we can now fine-tune a model using torchtune’s :ref:`QAT recipe`.\\nMake sure that you have first downloaded the Llama3 weights and tokenizer by\\nfollowing :ref:`these instructions`. In this tutorial,\\nwe use the following settings to demonstrate QAT’s effectiveness in recovering\\nquantization degradation compared to directly quantizing a model fine-tuned\\nwithout QAT. You can copy the default QAT config and make the following\\nmodifications accordingly:\\n\\n.. code-block:: bash\\n\\n tune cp llama3/8B_qat_full custom_8B_qat_full.yaml\\n\\n.. code-block:: yaml\\n\\n dataset:\\n _component_: torchtune.datasets.text_completion_dataset\\n source: allenai/c4\\n column: text\\n name: en\\n split: train\\n\\n ...\\n\\n epochs: 1\\n max_steps_per_epoch: 2000\\n fake_quant_after_n_steps: 1000\\n\\nBy default, this uses the :code:`torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer`,\\nwhich uses the same fake quantization configurations as the example above.\\n\\nEmpirically, we observed that disabling fake quantization for the first N steps\\nled to better results, presumably because doing so allows the weights to stabilize\\nbefore we start introducing quantization noise to the fine-tuning process.\\nFor this reason, here we disable fake quantization for the first 1000 steps.\\n\\nYou can then use the following command to run fine-tuning with QAT using the above\\nconfig. This workload requires at least 6 GPUs, each with VRAM of at least 80GB.\\nBy default, this uses the int8 dynamic per token activations + int4 grouped per\\nchannel weights quantization configuration as shown above:\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 6 qat_distributed --config custom_8B_qat_full.yaml\\n\\n.. note::\\n\\n Make sure to point to the location of your Llama3 weights and tokenizer. 
This can be done\\n either by adding :code:`checkpointer.\u001b[0m\u001b[32mcheckpoint_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32mmy_model_checkpoint_path\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32mtokenizer_checkpoint\u001b[0m\u001b[32m=\u001b[0m\u001b[32mmy_tokenizer_checkpoint_path\u001b[0m\u001b[32m`\\n or by directly modifying the :code:`8B_qat_full.yaml` file. See our :ref:`config_tutorial_label`\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n\\n QAT introduces memory and computation overheads compared to regular fine-tuning,\\n since fake quantization fundamentally involves extra ops and requires cloning\\n the weights to avoid mutating them when computing the fake quantized values.\\n In general, we expect around 30% decrease in fine-tuning speed for models like\\n Llama3-8B. With activation checkpointing, the increase in memory footprint per\\n GPU is minimal \u001b[0m\u001b[32m(\u001b[0m\u001b[32m< 5GB per GPU\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n\\nQuantizing the QAT model\\n------------------------\\n\\nNote that the QAT recipe above produces an unquantized bfloat16 model. The model\\nstructure is exactly the same as the model produced with regular full fine-tuning\\nwithout QAT, just with different weights. To actually get a quantized model,\\ncopy and make the following modifications to the quantization config:\\n\\n.. code-block:: bash\\n\\n tune cp quantization custom_quantization.yaml\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n checkpoint_dir: \\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32mft-model-00001-of-00001.bin\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n output_dir: \\n model_type: LLAMA3\\n\\n ...\\n\\n quantizer:\\n _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer\\n groupsize: 256\\n\\nThe following command performs the convert step in the QAT flow, which actually\\nquantizes the float model to a model with quantized weights:\\n\\n.. code-block:: bash\\n\\n tune run quantize --config custom_quantization.yaml\\n\\n.. note::\\n\\n Make sure to use the same QAT quantizer you used to fine-tune your model,\\n otherwise the numerics will be off and the quantized model will perform poorly.\\n\\n.. _qat_eval_label:\\n\\nEvaluating the quantized model\\n------------------------------\\n\\nNow that we have a quantized model, we can run some evaluations on it and compare the\\nresults against regular fine-tuning without QAT \u001b[0m\u001b[32m(\u001b[0m\u001b[32mi.e. post-training quantization\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\nTo achieve this, we use `EleutherAI’s evaluation harness `_\\nintegrated in torchtune. First, copy the evaluation config and make the following changes:\\n\\n.. code-block:: bash\\n\\n tune cp eleuther_evaluation custom_eleuther_evaluation.yaml\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelTorchTuneCheckpointer\\n checkpoint_dir: \\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32mft-model-00001-of-00001-8da4w.bin\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n output_dir: \\n model_type: LLAMA3\\n\\n ...\\n\\n tasks: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"hellaswag\", \"wikitext\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n quantizer:\\n _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer\\n groupsize: 256\\n\\n.. note::\\n\\n Since we are passing in a quantized model, be sure to use the corresponding\\n post-training quantizer instead of the QAT quantizer. For example, if you\\n used the :code:`Int8DynActInt4WeightQATQuantizer` during fine-tuning, you\\n should specify :code:`Int8DynActInt4WeightQuantizer` in this step. See the\\n `quantization recipe `_\\n for a full list of supported quantizers.\\n\\nNow run the evaluation recipe:\\n\\n.. code-block:: bash\\n\\n tune run eleuther_eval --config my_eleuther_evaluation.yaml\\n\\nThe results should look something like this:\\n\\n.. code-block:: bash\\n\\n # QAT quantized model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 activations + int4 weights\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|\\n |---------|------:|------|-----:|---------------|-----:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|9.9148|± |N/A |\\n | | |none | 0|byte_perplexity|1.5357|± |N/A |\\n | | |none | 0|bits_per_byte |0.6189|± |N/A |\\n |hellaswag| 1|none | 0|acc |0.5687|± |0.0049|\\n | | |none | 0|acc_norm |0.7536|± |0.0043|\\n\\nComparing these results to the model fine-tuned without QAT, we can see that\\nQAT was able to recover a significant portion of the quantization degradations\\nfrom the original unquantized model compared to PTQ. For example, normalized\\naccuracy in the hellaswag task dropped by 2.20% with PTQ but only 0.74% with\\nQAT when compared to the original unquantized model. Similarly, word perplexity\\nin the wikitext task increased by 2.048 with PTQ but only 1.190 with QAT \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlower\\nis better\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n.. code-block:: bash\\n\\n # PTQ quantized model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 activations + int4 weights\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric | Value | |Stderr|\\n |---------|------:|------|-----:|---------------|------:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|10.7735|± |N/A |\\n | | |none | 0|byte_perplexity| 1.5598|± |N/A |\\n | | |none | 0|bits_per_byte | 0.6413|± |N/A |\\n |hellaswag| 1|none | 0|acc | 0.5481|± |0.0050|\\n | | |none | 0|acc_norm | 0.7390|± |0.0044|\\n\\n.. code-block:: bash\\n\\n # Float model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mbfloat16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|\\n |---------|------:|------|-----:|---------------|-----:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|8.7251|± |N/A |\\n | | |none | 0|byte_perplexity|1.4994|± |N/A |\\n | | |none | 0|bits_per_byte |0.5844|± |N/A |\\n |hellaswag| 1|none | 0|acc |0.5740|± |0.0049|\\n | | |none | 0|acc_norm |0.7610|± |0.0043|\\n\\nThus, the QAT flow produced a quantized model that outperforms the post-training\\nquantized model. 
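As a quick sanity check, the degradation figures quoted above can be recomputed directly from the three evaluation tables. The snippet below is purely illustrative arithmetic; the numbers are copied from the evaluation output above and are not produced by any torchtune API.

.. code-block:: python

    # Illustrative only: recompute the quantization degradation quoted above
    # from the float (bfloat16), PTQ, and QAT evaluation tables.
    float_eval = {"acc_norm": 0.7610, "word_perplexity": 8.7251}
    ptq_eval   = {"acc_norm": 0.7390, "word_perplexity": 10.7735}
    qat_eval   = {"acc_norm": 0.7536, "word_perplexity": 9.9148}

    # hellaswag normalized accuracy drop relative to the unquantized model
    print(f"PTQ acc_norm drop: {100 * (float_eval['acc_norm'] - ptq_eval['acc_norm']):.2f}%")  # 2.20%
    print(f"QAT acc_norm drop: {100 * (float_eval['acc_norm'] - qat_eval['acc_norm']):.2f}%")  # 0.74%

    # wikitext word perplexity increase relative to the unquantized model (lower is better)
    print(f"PTQ perplexity increase: {ptq_eval['word_perplexity'] - float_eval['word_perplexity']:.3f}")  # 2.048
    print(f"QAT perplexity increase: {qat_eval['word_perplexity'] - float_eval['word_perplexity']:.3f}")  # 1.190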
Importantly, the quantized model structure is identical in both\\nflows, and so the model size, memory usage, and all other performance\\ncharacteristics are also the same.\\n\\nNote that although the weights are quantized to int4, the quantized model size\\nfor both the QAT and the PTQ flows are 8.187 GB, while the original float model\\nis 14.958 GB. This is because this quantizer uses int8 to represent the weights\\nas PyTorch does not have native int4 dtype support. A more efficient representation\\nis to pack the int4 weights, which will halve the quantized model size. This is\\nwhat the Int4WeightOnlyQuantizer does, and the corresponding QAT quantizer will\\nbe added in the future.\\n\\nLowering QAT model to device \u001b[0m\u001b[32m(\u001b[0m\u001b[32moptional\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n---------------------------------------\\n\\nOne important motivation for quantizing a model is to be able to run it in resource\\nconstrained environments. You can further lower your QAT Llama3 model to edge devices\\nsuch as smartphones using `executorch `_ by\\nfollowing `these instructions `_.\\nFor example, the following command lowers the model to the XNNPACK backend:\\n\\n.. code-block:: bash\\n\\n python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 256 -d fp32 --metadata \\'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"get_bos_id\":128000, \"get_eos_id\":128001\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\' --embedding-quantize 4,32 --\u001b[0m\u001b[32moutput_name\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"llama3_8da4w\u001b[0m\u001b[32m.pte\"\\n\\nThis results in a much smaller quantized model of size 3.881 GB. When benchmarked on a OnePlus 12 smartphone, this model also achieved the same inference and generation speeds as the post-training quantized model. This is because the model structures are the same across the two flows:\\n\\n.. list-table::\\n :widths: 25 25 25\\n :header-rows: 1\\n\\n * -\\n - QAT\\n - PTQ\\n * - Quantized model size\\n - 3.881 GB\\n - 3.881 GB\\n * - Inference speed\\n - 9.709 tok/s\\n - 9.815 tok/s\\n * - Generation speed\\n - 11.316 tok/s\\n - 11.364 tok/s\\n\\n.. _lora_finetune_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\nFine-Tuning Llama2 with LoRA\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network\\'s remaining parameters. 
LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer\\'s self-attention.\\n\\n.. note::\\n\\n If you\\'re unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas opposed to finetuning all model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_,\\nyou can expect to see further memory savings from the optimizer state.\\n\\n.. note::\\n\\n LoRA memory savings come primarily from gradient and optimizer states,\\n so if your model\\'s peak memory comes in its :code:`forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` method, then LoRA\\n may not reduce peak memory.\\n\\nHow does LoRA work?\\n-------------------\\n\\nLoRA replaces weight update matrices with a low-rank approximation. In general, weight updates\\nfor an arbitrary :code:`nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim,out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` layer could have rank as high as\\n:code:`min\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim,out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand other related papers such as `Aghajanyan et al. `_\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nhypothesize that the `intrinsic dimension `_\\nof these updates during LLM fine-tuning can in fact be much lower.\\nTo take advantage of this property, LoRA finetuning will freeze the original model,\\nthen add a trainable weight update from a low-rank projection. More explicitly, LoRA trains two\\nmatrices :code:`A` and :code:`B`. :code:`A` projects the inputs down to a much smaller rank \u001b[0m\u001b[32m(\u001b[0m\u001b[32moften four or eight in practice\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and\\n:code:`B` projects back up to the dimension output by the original linear layer.\\n\\nThe image below gives a simplified representation of a single weight update step from a full finetune\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mon the left\u001b[0m\u001b[32m)\u001b[0m\u001b[32m compared to a weight update step with LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mon the right\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. The LoRA matrices :code:`A` and :code:`B`\\nserve as an approximation to the full rank weight update in blue.\\n\\n.. image:: /_static/img/lora_diagram.png\\n\\nAlthough LoRA introduces a few extra parameters in the model :code:`forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`, only the :code:`A` and :code:`B` matrices are trainable.\\nThis means that with a rank :code:`r` LoRA decomposition, the number of gradients we need to store reduces\\nfrom :code:`in_dim*out_dim` to :code:`r*\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim+out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. \u001b[0m\u001b[32m(\u001b[0m\u001b[32mRemember that in general :code:`r`\\nis much smaller than :code:`in_dim` and :code:`out_dim`.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nFor example, in the 7B Llama2\\'s self-attention, :code:`\u001b[0m\u001b[32min_dim\u001b[0m\u001b[32m=\u001b[0m\u001b[32mout_dim\u001b[0m\u001b[32m=4096` for the Q, K,\\nand V projections. 
This means a LoRA decomposition of rank :code:`\u001b[0m\u001b[32mr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m` will reduce the number of trainable\\nparameters for a given projection from :math:`4096 * 4096 \\\\approx 15M` to :math:`8 * 8192 \\\\approx 65K`, a\\nreduction of over 99%.\\n\\nLet\\'s take a look at a minimal implementation of LoRA in native PyTorch.\\n\\n\\n.. code-block:: python\\n\\n import torch\\n from torch import nn\\n\\n class LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mnn.Module\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n def __init__\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n self,\\n in_dim: int,\\n out_dim: int,\\n rank: int,\\n alpha: float,\\n dropout: float\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n # These are the weights from the original pretrained model\\n self.linear = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim, out_dim, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # These are the new LoRA params. In general rank << in_dim, out_dim\\n self.lora_a = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim, rank, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n self.lora_b = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mrank, out_dim, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Rank and alpha are commonly-tuned hyperparameters\\n self.rank = rank\\n self.alpha = alpha\\n\\n # Most implementations also include some dropout\\n self.dropout = nn.Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # The original params are frozen, and only LoRA params are trainable.\\n self.linear.weight.requires_grad = False\\n self.lora_a.weight.requires_grad = True\\n self.lora_b.weight.requires_grad = True\\n\\n def forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself, x: torch.Tensor\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> torch.Tensor:\\n # This would be the output of the original model\\n frozen_out = self.linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # lora_a projects inputs down to the much smaller self.rank,\\n # then lora_b projects back up to the output dimension\\n lora_out = self.lora_b\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.lora_a\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Finally, scale by the alpha parameter \u001b[0m\u001b[32m(\u001b[0m\u001b[32mnormalized by rank\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # and add to the original model\\'s outputs\\n return frozen_out + \u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.alpha / self.rank\u001b[0m\u001b[32m)\u001b[0m\u001b[32m * lora_out\\n\\nThere are some other details around initialization which we omit here, but if you\\'d like to know more\\nyou can see our implementation in :class:`~torchtune.modules.peft.LoRALinear`.\\nNow that we understand what LoRA is doing, let\\'s look at how we can apply it to our favorite models.\\n\\nApplying LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. 
code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.\\n # We can also set \u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m or \u001b[0m\u001b[32mapply_lora_to_output\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mbase_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Print the same 
for Llama2 with LoRA weights\\n >>> print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32m0\u001b[0m\u001b[32m.0, \u001b[0m\u001b[32minplace\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_a\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_b\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32m0\u001b[0m\u001b[32m.0, \u001b[0m\u001b[32minplace\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_a\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_b\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, 
\u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n\\nNotice that our LoRA model\\'s layer contains additional weights in the Q and V projections,\\nas expected. Additionally, inspecting the type of :code:`lora_model` and\\n:code:`base_model`, would show that they are both instances of the same :class:`~torchtune.modules.TransformerDecoder`.\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mFeel free to verify this for yourself.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nWhy does this matter? torchtune makes it easy to load checkpoints for LoRA directly from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mbase_model.state_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mstrict\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. note::\\n Whenever loading weights with :code:`\u001b[0m\u001b[32mstrict\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Set \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m on lora_params, and \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m on all others.\\n set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Print the total number of parameters\\n total_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n trainable_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m if p.requires_grad\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n f\"\"\"\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtotal_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m total params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtrainable_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\" trainable params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m(\u001b[0m\u001b[32m100.0 * trainable_params / total_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:.2f\u001b[0m\u001b[32m}\u001b[0m\u001b[32m% of all params are trainable.\\n \"\"\"\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas detailed :ref:`here`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs \u001b[0m\u001b[32m(\u001b[0m\u001b[32meach having VRAM of at least 16GB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.\u001b[0m\u001b[32mcheckpoint_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32mmy_model_checkpoint_path\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32mtokenizer_checkpoint\u001b[0m\u001b[32m=\u001b[0m\u001b[32mmy_tokenizer_checkpoint_path\u001b[0m\u001b[32m`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma\u001b[0m\u001b[32m)\u001b[0m\u001b[32m the number of GPUs you have available,\\n and \u001b[0m\u001b[32m(\u001b[0m\u001b[32mb\u001b[0m\u001b[32m)\u001b[0m\u001b[32m the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\'q_proj\\', \\'v_proj\\'\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\\nbut as long as we keep :code:`rank<`_\\nfloating-point format. This can be done via the command:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama2/7B_lora_single_device\\n\\nOn a single device, we may need to be more cognizant of our peak memory. Let\\'s run a few experiments\\nto see our peak memory during a finetune. We will experiment along two axes:\\nfirst, which model layers have LoRA applied, and second, the rank of each LoRA layer. \u001b[0m\u001b[32m(\u001b[0m\u001b[32mWe will scale\\nalpha in parallel to LoRA rank, as discussed above.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nTo compare the results of our experiments, we can evaluate our models on `truthfulqa_mc2 `_, a task from\\nthe `TruthfulQA `_ benchmark for language models. For more details on how to run this and other evaluation tasks\\nwith torchtune\\'s EleutherAI evaluation harness integration, see our :ref:`End-to-End Workflow Tutorial `.\\n\\nPreviously, we only enabled LoRA for the linear layers in each self-attention module, but in fact there are other linear\\nlayers we can apply LoRA to: MLP layers and our model\\'s final output projection. Note that for Llama-2-7B the final output\\nprojection maps to the vocabulary dimension \u001b[0m\u001b[32m(\u001b[0m\u001b[32m32000 instead of 4096 as in the other linear layers\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, so enabling LoRA for this layer will increase\\nour peak memory a bit more than the other layers. We can make the following changes to our config:\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\'q_proj\\', \\'k_proj\\', \\'v_proj\\', \\'output_proj\\'\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n apply_lora_to_mlp: True\\n apply_lora_to_output: True\\n ...\\n\\n.. note::\\n All the finetuning runs below use the `llama2/7B_lora_single_device `_\\n config, which has a default batch size of 2. Modifying the batch size \u001b[0m\u001b[32m(\u001b[0m\u001b[32mor other hyperparameters, e.g. the optimizer\u001b[0m\u001b[32m)\u001b[0m\u001b[32m will impact both peak memory\\n and final evaluation results.\\n\\n.. 
list-table::\\n :widths: 25 25 25 25 25\\n :header-rows: 1\\n\\n * - LoRA Layers\\n - Rank\\n - Alpha\\n - Peak Memory\\n - Accuracy \u001b[0m\u001b[32m(\u001b[0m\u001b[32mtruthfulqa_mc2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n * - Q and V only\\n - 8\\n - 16\\n - **15.57 GB**\\n - 0.475\\n * - all layers\\n - 8\\n - 16\\n - 15.87 GB\\n - 0.508\\n * - Q and V only\\n - 64\\n - 128\\n - 15.86 GB\\n - 0.504\\n * - all layers\\n - 64\\n - 128\\n - 17.04 GB\\n - **0.514**\\n\\nWe can see that our baseline settings give the lowest peak memory, but our evaluation performance is relatively lower.\\nBy enabling LoRA for all linear layers and increasing the rank to 64, we see almost a 4% absolute improvement\\nin our accuracy on this task, but our peak memory also increases by about 1.4GB. These are just a couple simple\\nexperiments; we encourage you to run your own finetunes to find the right tradeoff for your particular setup.\\n\\nAdditionally, if you want to decrease your model\\'s peak memory even further \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand still potentially achieve similar\\nmodel quality results\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, you can check out our :ref:`QLoRA tutorial\u001b[0m\u001b[32m`.\\n'\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33moutput_message\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.'\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33msession_id\u001b[0m=\u001b[32m'6910f07f-f8e0-407b-8441-60a90e7b1834'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m16\u001b[0m, \u001b[1;36m883581\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33msteps\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mInferenceStep\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mapi_model_response\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. 
The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'49409ea3-4a4d-4433-aa71-e6e4ec1bb054'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'inference'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'212541bc-0cfa-4f04-a8a5-25fe2892bc8f'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m144218\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m17\u001b[0m, \u001b[1;36m267803\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'212541bc-0cfa-4f04-a8a5-25fe2892bc8f'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m155387\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33moutput_attachments\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Session(\n",
-       "session_id='6910f07f-f8e0-407b-8441-60a90e7b1834',\n",
-       "session_name='simple_session_fd65775f-25c4-465b-86e9-7cee68d15130',\n",
-       "started_at=datetime.datetime(2025, 3, 22, 19, 29, 16, 862958, tzinfo=datetime.timezone.utc),\n",
-       "turns=[\n",
-       "│   │   Turn(\n",
-       "│   │   │   input_messages=[\n",
-       "│   │   │   │   UserMessage(\n",
-       "│   │   │   │   │   content='What precision formats does torchtune support?',\n",
-       "│   │   │   │   │   role='user',\n",
-       "│   │   │   │   │   context='.. _memory_optimization_overview_label:\\n\\n============================\\nMemory Optimization Overview\\n============================\\n\\n**Author**: `Salman Mohammadi <https://github.com/SalmanMohammadi>`_\\n\\ntorchtune comes with a host of plug-and-play memory optimization components which give you lots of flexibility\\nto ``tune`` our recipes to your hardware. This page provides a brief glossary of these components and how you might use them.\\nTo make things easy, we\\'ve summarized these components in the following table:\\n\\n.. csv-table:: Memory optimization components\\n   :header: \"Component\", \"When to use?\"\\n   :widths: auto\\n\\n   \":ref:`glossary_precision`\", \"You\\'ll usually want to leave this as its default ``bfloat16``. It uses 2 bytes per model parameter instead of 4 bytes when using ``float32``.\"\\n   \":ref:`glossary_act_ckpt`\", \"Use when you\\'re memory constrained and want to use a larger model, batch size or context length. Be aware that it will slow down training speed.\"\\n   \":ref:`glossary_act_off`\", \"Similar to activation checkpointing, this can be used when memory constrained, but may decrease training speed. This **should** be used alongside activation checkpointing.\"\\n   \":ref:`glossary_grad_accm`\", \"Helpful when memory-constrained to simulate larger batch sizes. Not compatible with optimizer in backward. Use it when you can already fit at least one sample without OOMing, but not enough of them.\"\\n   \":ref:`glossary_low_precision_opt`\", \"Use when you want to reduce the size of the optimizer state. This is relevant when training large models and using optimizers with momentum, like Adam. Note that lower precision optimizers may reduce training stability/accuracy.\"\\n   \":ref:`glossary_opt_in_bwd`\", \"Use it when you have large gradients and can fit a large enough batch size, since this is not compatible with ``gradient_accumulation_steps``.\"\\n   \":ref:`glossary_cpu_offload`\", \"Offloads optimizer states and (optionally) gradients to CPU, and performs optimizer steps on CPU. This can be used to significantly reduce GPU memory usage at the cost of CPU RAM and training speed. Prioritize using it only if the other techniques are not enough.\"\\n   \":ref:`glossary_lora`\", \"When you want to significantly reduce the number of trainable parameters, saving gradient and optimizer memory during training, and significantly speeding up training. This may reduce training accuracy\"\\n   \":ref:`glossary_qlora`\", \"When you are training a large model, since quantization will save 1.5 bytes * (# of model parameters), at the potential cost of some training speed and accuracy.\"\\n   \":ref:`glossary_dora`\", \"a variant of LoRA that may improve model performance at the cost of slightly more memory.\"\\n\\n\\n.. note::\\n\\n  In its current state, this tutorial is focused on single-device optimizations. Check in soon as we update this page\\n  for the latest memory optimization features for distributed fine-tuning.\\n\\n.. _glossary_precision:\\n\\n\\nModel Precision\\n---------------\\n\\n*What\\'s going on here?*\\n\\nWe use the term \"precision\" to refer to the underlying data type used to represent the model and optimizer parameters.\\nWe support two data types in torchtune:\\n\\n.. 
note::\\n\\n  We recommend diving into Sebastian Raschka\\'s `blogpost on mixed-precision techniques <https://sebastianraschka.com/blog/2023/llm-mixed-precision-copy.html>`_\\n  for a deeper understanding of concepts around precision and data formats.\\n\\n* ``fp32``, commonly referred to as \"full-precision\", uses 4 bytes per model and optimizer parameter.\\n* ``bfloat16``, referred to as \"half-precision\", uses 2 bytes per model and optimizer parameter - effectively half\\n  the memory of ``fp32``, and also improves training speed. Generally, if your hardware supports training with ``bfloat16``,\\n  we recommend using it - this is the default setting for our recipes.\\n\\n.. note::\\n\\n  Another common paradigm is \"mixed-precision\" training: where model weights are in ``bfloat16`` (or ``fp16``), and optimizer\\n  states are in ``fp32``. Currently, we don\\'t support mixed-precision training in torchtune.\\n\\n*Sounds great! How do I use it?*\\n\\nSimply use the ``dtype`` flag or config entry in all our recipes! For example, to use half-precision training in ``bf16``,\\nset ``dtype=bf16``.\\n\\n.. _glossary_act_ckpt:\\n\\nActivation Checkpointing\\n------------------------\\n\\n*What\\'s going on here?*\\n\\nThe relevant section in the `PyTorch documentation <https://pytorch.org/docs/stable/checkpoint.html>`_ explains this concept well.\\nTo quote:\\n\\n  Activation checkpointing is a technique that trades compute for memory.\\n  Instead of keeping tensors needed for backward alive until they are used in\\n  gradient computation during backward, forward computation in checkpointed\\n  regions omits saving tensors for backward and recomputes them during the backward pass.\\n\\nThis setting is helpful for when you\\'re memory-constrained, especially due to larger batch sizes or longer context lengths.\\nHowever, these savings in memory come at the cost of training speed (i.e. tokens-per-second),\\nand in most cases training can slow down quite a bit as a result of this activation recomputation.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation checkpointing, use ``enable_activation_checkpointing=True``.\\n\\n.. _glossary_act_off:\\n\\nActivation Offloading\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nYou may have just read about activation checkpointing! Similar to checkpointing, offloading is a memory\\nefficiency technique that allows saving GPU VRAM by temporarily moving activations to CPU and bringing\\nthem back when needed in the backward pass.\\n\\nSee `PyTorch autograd hook tutorial <https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#saving-tensors-to-cpu>`_\\nfor more details about how this is implemented through :func:`torch.autograd.graph.saved_tensors_hooks`.\\n\\nThis setting is especially helpful for larger batch sizes, or longer context lengths when you\\'re memory constrained.\\nWhile of course it takes runtime and resources to move Tensors from GPU to CPU and back, the implementation in\\ntorchtune uses multiple CUDA streams (when available) in order to overlap the extra communication with the computation\\nto hide the extra runtime. As the communication workload is variable depending on the number and size of tensors being\\noffloaded, we do not recommend using it unless :ref:`glossary_act_ckpt` is also enabled, in which case only the checkpointed\\ntensors will be offloaded.\\n\\n*Sounds great! 
How do I use it?*\\n\\nTo enable activation offloading, use the ``enable_activation_offloading`` config entry or flag\\nin our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow\\nusage of streams, make sure you are on a sufficiently recent version of PyTorch.\\n\\n.. _glossary_grad_accm:\\n\\nGradient Accumulation\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nGradient accumulation allows you to simulate large batch sizes by *accumulating* gradients over several\\nbatches before updating model parameters using the optimizer. Concretely, the total number of samples used\\nfor a gradient update when using gradient accumulation is:\\n\\n  ``total_batch_size = batch_size * gradient_accumulation_steps``\\n\\nFor example: with ``batch_size=1`` and ``gradient_accumulation_steps=32`` we get a total batch size of 32.\\n\\n.. note::\\n\\n  For other components in torchtune which use \"steps\", such as :ref:`metric logging <metric_logging_label>`, or\\n  :func:`learning rate schedulers <torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup>`, a \"step\" is counted as a\\n  single update to model parameters, rather than a single model forward pass with the data.\\n  Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``.\\n  Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.\\n  For this reason, metric logging will appear less frequently when training with gradient accumulation,\\n  and progress bars may update more slowly.\\n\\n\\nIf you\\'re using one of our distributed recipes, simply multiply by the number of devices:\\n\\n  ``total_batch_size = batch_size * gradient_accumulation_steps * num_devices``\\n\\nGradient accumulation is especially useful when you can fit at least one sample in your GPU. In this case, artificially increasing the batch by\\naccumulating gradients might give you faster training speeds than using other memory optimization techniques that trade off memory for speed, like :ref:`activation checkpointing <glossary_act_ckpt>`.\\n\\n*Sounds great! How do I use it?*\\n\\nAll of our finetuning recipes support simulating larger batch sizes by accumulating gradients. Just set the\\n``gradient_accumulation_steps`` flag or config entry.\\n\\n.. note::\\n\\n  Gradient accumulation should always be set to 1 when :ref:`fusing the optimizer step into the backward pass <glossary_opt_in_bwd>`.\\n\\nOptimizers\\n----------\\n\\n.. _glossary_low_precision_opt:\\n\\nLower Precision Optimizers\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nIn addition to :ref:`reducing model and optimizer precision <glossary_precision>` during training, we can further reduce precision in our optimizer states.\\nAll of our recipes support lower-precision optimizers from the `torchao <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim>`_ library.\\nFor single device recipes, we also support `bitsandbytes <https://huggingface.co/docs/bitsandbytes/main/en/index>`_.\\n\\nA good place to start might be the :class:`torchao.prototype.low_bit_optim.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers.\\nBoth reduce memory by quantizing the optimizer state dict. Paged optimizers will also offload to CPU if there isn\\'t enough GPU memory available. In practice,\\nyou can expect higher memory savings from bnb\\'s PagedAdamW8bit but higher training speed from torchao\\'s AdamW8bit.\\n\\n*Sounds great! 
How do I use it?*\\n\\nTo use this in your recipes, make sure you have installed torchao (``pip install torchao``) or bitsandbytes (``pip install bitsandbytes``). Then, enable\\na low precision optimizer using the :ref:`cli_label`:\\n\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=torchao.prototype.low_bit_optim.AdamW8bit\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=bitsandbytes.optim.PagedAdamW8bit\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n  optimizer:\\n    _component_: bitsandbytes.optim.PagedAdamW8bit\\n    lr: 2e-5\\n\\n.. _glossary_opt_in_bwd:\\n\\nFusing Optimizer Step into Backward Pass\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nStateful optimizers (e.g. optimizers which use momentum) are the default in modern deep learning due to their stable convergence properties.\\nHowever, maintaining a state of gradient statistics comes at the cost of additional memory usage. An immediate alternative might be to\\nturn to stateless optimizers such as `stochastic gradient descent <https://pytorch.org/docs/stable/generated/torch.optim.SGD.html>`_\\nwithout momentum, which don\\'t require any additional memory usage, but will likely result in worse convergence during training.\\n\\nCan we find a middle ground here? Let\\'s consider a technique which enables the use of \"stateful\" optimizers such as `AdamW <https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html>`_\\nwithout the memory overhead of gradient statistics, and without sacrificing their desirable convergence properties.\\nHow is this possible, you might ask? By *completely removing the buffer of gradients* which are stored by the optimizer during its ``step()``.\\n\\nTo understand how this works, we encourage you to read through the relevant PyTorch tutorial on this concept:\\n`How to save memory by fusing the optimizer step into the backward pass <https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html>`_.\\n\\n\\n*Sounds great! How do I use it?*\\n\\n.. todo ref full finetune recipe doc\\n\\nIn torchtune, you can enable this feature using the ``optimizer_in_bwd`` flag. This feature works best when using a stateful optimizer\\nwith a model with a lot of parameters, and when you don\\'t need to use :ref:`gradient accumulation <glossary_grad_accm>`.\\nYou won\\'t see meaningful impact when finetuning LoRA recipes, since in this case the number of parameters being updated are small.\\n\\n.. _glossary_cpu_offload:\\n\\nOffloading Optimizer/Gradient states to CPU\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nWe\\'ve mentioned above the concept of optimizer states - memory used by the stateful optimizers to maintain a state of gradient statistics, and\\nmodel gradients - tensors used to store gradients when we perform model backwards passes. We support using CPU offloading in our single-device recipes\\nthrough the `CPUOffloadOptimizer <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_ from ``torchao``.\\n\\nThis optimizer can wrap any base optimizer and works by keeping the optimizer states and performing the optimizer step on CPU, thus reducing\\nGPU memory usage by the size of the optimizer states. 
Additionally, we can also offload gradients to the CPU by using ``offload_gradients=True``.\\n\\nIf finetuning on a single device, another option is to use the ``PagedAdamW8bit`` from bitsandbytes, mentioned :ref:`above <glossary_low_precision_opt>`, which will *only* offload to CPU\\nwhen there is not enough GPU memory available.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this optimizer in your recipes, set the ``optimizer`` key in your config to :class:`torchao.prototype.low_bit_optim.CPUOffloadOptimizer`, which\\nwill use the :class:`torch.optim.AdamW` optimizer with ``fused=True`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n  optimizer.offload_gradients=True \\\\\\n  lr=4e-5\\n\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n  optimizer:\\n    _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n    offload_gradients: True\\n    # additional key-word arguments can be passed to torch.optim.AdamW\\n    lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer(\\n     model.parameters(), # your model here\\n     Adam,\\n     lr=1e-5,\\n     fused=True\\n )\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to (1) use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and (2) give GPU more work per optimizer step to amortize the offloading time (e.g. larger batch size with activation checkpointing, gradient accumulation).\\n* Gradient accumulation should always be set to 1 when ``offload_gradients=True``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``fsdp_cpu_offload=True`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 <https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md>`_ to see how they differ.\\n\\n\\n.. _glossary_peft:\\n\\nParameter Efficient Fine-Tuning (PEFT)\\n--------------------------------------\\n\\n.. _glossary_lora:\\n\\nLow Rank Adaptation (LoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n*What\\'s going on here?*\\n\\nYou can read our tutorial on :ref:`finetuning Llama2 with LoRA<lora_finetune_label>` to understand how LoRA works, and how to use it.\\nSimply stated, LoRA greatly reduces the number of trainable parameters, thus saving significant gradient and optimizer\\nmemory during training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using any of our recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device<lora_finetune_recipe_label>`. 
These recipes utilize\\nLoRA-enabled model builders, which we support for all our models, and also use the ``lora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3` model has a corresponding :func:`torchtune.models.llama3.lora_llama3`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g.:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n  LoRA to:\\n\\n  * ``q_proj`` applies LoRA to the query projection layer.\\n  * ``k_proj`` applies LoRA to the key projection layer.\\n  * ``v_proj`` applies LoRA to the value projection layer.\\n  * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\\n  this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n  This is usually a projection to vocabulary space (e.g. in language models), but\\n  other modelling tasks may have different projections - classifier models will project\\n  to the number of classes, for example.\\n\\n.. note::\\n\\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n  final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e.:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\", \"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of the LoRA decomposition, where ``lora_rank << in_dim`` and ``lora_rank << out_dim``\\n  \\\\- the dimensions of an arbitrary linear layer in the model. Concretely, ``lora_rank`` reduces the number of gradients stored\\n  in a linear fashion from ``in_dim * out_dim`` to ``lora_rank * (in_dim + out_dim)``. Typically, we have ``lora_rank in [8, 256]``.\\n* ``lora_alpha: float`` affects the magnitude of the LoRA updates. A larger alpha results in larger updates to the base model weights,\\n  potentially at the cost of training stability; conversely, a smaller alpha can stabilize training at the cost of slower learning.\\n  We provide default settings for these parameters which we\\'ve tested with all of our models, but we encourage you to adjust them\\n  to your specific use case. Typically, one jointly changes ``lora_rank`` and ``lora_alpha`` together, where ``lora_alpha ~= 2*lora_rank``.\\n* ``lora_dropout`` introduces dropout in the LoRA layers to help regularize training. 
We default to 0.0 for all of our models.\\n\\nAs above, these parameters are also specified under the ``model`` flag or config entry:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"] \\\\\\n  model.lora_rank=32 \\\\\\n  model.lora_alpha=64\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n    lora_rank: 32\\n    lora_alpha: 64\\n\\n.. note::\\n\\n  To get a deeper sense of how LoRA parameters affect memory usage during training,\\n  see the :ref:`relevant section in our Llama2 LoRA tutorial<lora_tutorial_memory_tradeoff_label>`.\\n\\n.. _glossary_qlora:\\n\\nQuantized Low Rank Adaptation (QLoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`QLoRA <https://arxiv.org/abs/2305.14314>`_ is a memory enhancement on top of `LoRA <https://arxiv.org/abs/2106.09685>`_\\nthat maintains the frozen model parameters from LoRA in 4-bit quantized precision, thereby reducing memory usage.\\nThis is enabled through a novel  4-bit NormalFloat (NF4) data type proposed by the authors, which allows for 4-8x less\\nparameter memory usage whilst retaining model accuracy. You can read our tutorial on :ref:`finetuning Llama2 with QLoRA<qlora_finetune_label>`\\nfor a deeper understanding of how it works.\\n\\nWhen considering using QLoRA to reduce memory usage, it\\'s worth noting that QLoRA is slower than LoRA and may not be worth it if\\nthe model you are finetuning is small. In numbers, QLoRA saves roughly 1.5 bytes * (# of model parameters). Also, although QLoRA quantizes the model,\\nit minimizes accuracy degradation by up-casting quantized parameters to the original higher precision datatype during model forward passes - this up-casting may incur penalties to training speed.\\nThe :ref:`relevant section <qlora_compile_label>` in our QLoRA tutorial demonstrates the usage of ``torch.compile`` to address this by speeding up training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using QLoRA with any of our LoRA recipes, i.e. recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device<lora_finetune_recipe_label>`. These recipes utilize\\nQLoRA-enabled model builders, which we support for all our models, and also use the ``qlora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3_8b` model has a corresponding :func:`torchtune.models.llama3.qlora_llama3_8b`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with QLoRA quickly,\\njust specify any config with ``_qlora`` in its name.\\n\\nAll the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA <glossary_lora>`\\nto see how to configure these parameters.\\n\\nTo configure from the command line:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=32 \\\\\\n  model.lora_alpha=64\\n\\n\\nor, by modifying a config:\\n\\n.. 
code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.qlora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 32\\n    lora_alpha: 64\\n\\n.. _glossary_dora:\\n\\nWeight-Decomposed Low-Rank Adaptation (DoRA)\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`DoRA <https://arxiv.org/abs/2402.09353>`_ is another PEFT technique which builds on top of LoRA by\\nfurther decomposing the pre-trained weights into two components: magnitude and direction. The magnitude component\\nis a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA decomposition and\\nupdates the orientation of weights.\\n\\nDoRA adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to\\nimprove the performance of LoRA, particularly at low ranks.\\n\\n*Sounds great! How do I use it?*\\n\\nMuch like LoRA and QLoRA, you can finetune using DoRA with any of our LoRA recipes. We use the same model builders for LoRA\\nas we do for DoRA, so you can use the ``lora_`` version of any model builder with ``use_dora=True``. For example, to finetune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize_base=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n.. _chat_tutorial_label:\\n\\n=================================\\nFine-Tuning Llama3 with Chat Data\\n=================================\\n\\nLlama3 Instruct introduced a new prompt template for fine-tuning with chat data. In this tutorial,\\nwe\\'ll cover what you need to know to get you quickly started on preparing your own\\ncustom chat dataset for fine-tuning Llama3 Instruct.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` You will learn:\\n\\n      * How the Llama3 Instruct format differs from Llama2\\n      * All about prompt templates and special tokens\\n      * How to use your own chat dataset to fine-tune Llama3 Instruct\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`configuring datasets<chat_dataset_usage_label>`\\n      * Know how to :ref:`download Llama3 Instruct weights <llama3_label>`\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you\\'ll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-2>`_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n    <s>[INST] <<SYS>>\\n    You are a helpful, respectful, and honest assistant.\\n    <</SYS>>\\n\\n    Hi! I am a human. [/INST] Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant </s>\\n\\nLlama3 Instruct `overhauled <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3>`_\\nthe template from Llama2 to better support multiturn conversations. The same text\\nin the Llama3 Instruct format would look like this:\\n\\n.. code-block:: text\\n\\n    <|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n    You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n    Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n    Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n    The Llama3 Base model uses a `different prompt template\\n    <https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3>`_ than Llama3 Instruct\\n    because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n    are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n    template for optimal performance. Generally, for instruct and chat data, we recommend using\\n    Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n    Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n    sample = [\\n        {\\n            \"role\": \"system\",\\n            \"content\": \"You are a helpful, respectful, and honest assistant.\",\\n        },\\n        {\\n            \"role\": \"user\",\\n            \"content\": \"Who are the most influential hip-hop artists of all time?\",\\n        },\\n        {\\n            \"role\": \"assistant\",\\n            \"content\": \"Here is a list of some of the most influential hip-hop \"\\n            \"artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\",\\n        },\\n    ]\\n\\nNow, let\\'s format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and\\nsee how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**,\\nwhich simply structures a prompt with flavor text to indicate a certain task.\\n\\n.. code-block:: python\\n\\n    from torchtune.data import Llama2ChatTemplate, Message\\n\\n    messages = [Message.from_dict(msg) for msg in sample]\\n    formatted_messages = Llama2ChatTemplate.format(messages)\\n    print(formatted_messages)\\n    # [\\n    #     Message(\\n    #         role=\\'user\\',\\n    #         content=\\'[INST] <<SYS>>\\\\nYou are a helpful, respectful, and honest assistant.\\\\n<</SYS>>\\\\n\\\\nWho are the most influential hip-hop artists of all time? [/INST] \\',\\n    #         ...,\\n    #     ),\\n    #     Message(\\n    #         role=\\'assistant\\',\\n    #         content=\\'Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\\',\\n    #         ...,\\n    #     ),\\n    # ]\\n\\nThere are also special tokens used by Llama2, which are not in the prompt template.\\nIf you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you\\'ll notice that\\nwe don\\'t include the :code:`<s>` and :code:`</s>` tokens. These are the beginning-of-sequence\\n(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer\\nthan the rest of the prompt template. Let\\'s tokenize this example with the\\n:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see\\nwhy.\\n\\n.. code-block:: python\\n\\n    from torchtune.models.llama2 import llama2_tokenizer\\n\\n    tokenizer = llama2_tokenizer(\"/tmp/Llama-2-7b-hf/tokenizer.model\")\\n    user_message = formatted_messages[0].text_content\\n    tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True)\\n    print(tokens)\\n    # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2]\\n\\nWe\\'ve added the BOS and EOS tokens when encoding our example text. This shows up\\nas IDs 1 and 2. We can verify that these are our BOS and EOS tokens.\\n\\n.. code-block:: python\\n\\n    print(tokenizer._spm_model.spm_model.piece_to_id(\"<s>\"))\\n    # 1\\n    print(tokenizer._spm_model.spm_model.piece_to_id(\"</s>\"))\\n    # 2\\n\\nThe BOS and EOS tokens are what we call special tokens, because they have their own\\nreserved token IDs. This means that they will index to their own individual vectors in\\nthe model\\'s learnt embedding table. The rest of the prompt template tags, :code:`[INST]`\\nand :code:`<<SYS>>` are tokenized as normal text and not their own IDs.\\n\\n.. 
code-block:: python\\n\\n    print(tokenizer.decode(518))\\n    # \\'[\\'\\n    print(tokenizer.decode(25580))\\n    # \\'INST\\'\\n    print(tokenizer.decode(29962))\\n    # \\']\\'\\n    print(tokenizer.decode([3532, 14816, 29903, 6778]))\\n    # \\'<<SYS>>\\'\\n\\nIt\\'s important to note that you should not place the special reserved tokens in your\\ninput prompts manually, as it will be treated as normal text and not as a special\\ntoken.\\n\\n.. code-block:: python\\n\\n    print(tokenizer.encode(\"<s>\", add_bos=False, add_eos=False))\\n    # [529, 29879, 29958]\\n\\nNow let\\'s take a look at Llama3\\'s formatting to see how it\\'s tokenized differently\\nthan Llama2.\\n\\n.. code-block:: python\\n\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    messages = [Message.from_dict(msg) for msg in sample]\\n    tokens, mask = tokenizer.tokenize_messages(messages)\\n    print(tokenizer.decode(tokens))\\n    # \\'<|start_header_id|>system<|end_header_id|>\\\\n\\\\nYou are a helpful, respectful,\\n    # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\\\n\\\\nWho\\n    # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|>\\n    # assistant<|end_header_id|>\\\\n\\\\nHere is a list of some of the most influential hip-hop\\n    # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>\\'\\n\\n.. note::\\n    We used the ``tokenize_messages`` API for Llama3, which is different than\\n    encode. It simply manages adding all the special tokens in the correct\\n    places after encoding the individual messages.\\n\\nWe can see that the tokenizer handled all the formatting without us specifying a prompt\\ntemplate. It turns out that all of the additional tags are special tokens, and we don\\'t require\\na separate prompt template. We can verify this by checking if the tags get encoded\\nas their own token IDs.\\n\\n.. code-block:: python\\n\\n    print(tokenizer.special_tokens[\"<|begin_of_text|>\"])\\n    # 128000\\n    print(tokenizer.special_tokens[\"<|eot_id|>\"])\\n    # 128009\\n\\nThe best part is - all these special tokens are handled purely by the tokenizer.\\nThat means you won\\'t have to worry about messing up any required prompt templates!\\n\\n\\nWhen should I use a prompt template?\\n------------------------------------\\n\\nWhether or not to use a prompt template is governed by what your desired inference\\nbehavior is. You should use a prompt template if you are running inference on the\\nbase model and it was pre-trained with a prompt template, or you want to prime a\\nfine-tuned model to expect a certain prompt structure on inference for a specific task.\\n\\nIt is not strictly necessary to fine-tune with a prompt template, but generally\\nspecific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate`\\nprovides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text.\\nThis would wrap around the user message, with the assistant message untouched.\\n\\n.. code-block:: python\\n\\n    f\"Summarize this dialogue:\\\\n{dialogue}\\\\n---\\\\nSummary:\\\\n\"\\n\\nYou can fine-tune Llama2 with this template even though the model was originally pre-trained\\nwith the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model\\nsees during inference. 
The model should be robust enough to adapt to a new template.\\n\\n\\nFine-tuning on a custom chat dataset\\n------------------------------------\\n\\nLet\\'s test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom\\nchat dataset. We\\'ll walk through how to set up our data so that it can be tokenized\\ncorrectly and fed into our model.\\n\\nLet\\'s say we have a local dataset saved as a JSON file that contains conversations\\nwith an AI model. How can we get something like this into a format\\nLlama3 understands and tokenizes correctly?\\n\\n.. code-block:: python\\n\\n    # data/my_data.json\\n    [\\n        {\\n            \"dialogue\": [\\n                {\\n                    \"from\": \"human\",\\n                    \"value\": \"What is your name?\"\\n                },\\n                {\\n                    \"from\": \"gpt\",\\n                    \"value\": \"I am an AI assistant, I don\\'t have a name.\"\\n                },\\n                {\\n                    \"from\": \"human\",\\n                    \"value\": \"Pretend you have a name.\"\\n                },\\n                {\\n                    \"from\": \"gpt\",\\n                    \"value\": \"My name is Mark Zuckerberg.\"\\n                }\\n            ]\\n        },\\n    ]\\n\\nLet\\'s first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we\\nhave conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://docs.mistral.ai/getting-started/open_weight_models/#chat-template>`_.\\n\\nNow we\\'re ready to start fine-tuning! We\\'ll use the built-in LoRA single device recipe.\\nUse the :ref:`tune cp <tune_cp_cli_label>` command to get a copy of the :code:`8B_lora_single_device.yaml`\\nconfig and update it with your dataset configuration.\\n\\nLaunch the fine-tune!\\n\\n.. code-block:: bash\\n\\n    $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15\\n\\n.. _llama3_label:\\n\\n========================\\nMeta Llama3 in torchtune\\n========================\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to:\\n\\n      * Download the Llama3-8B-Instruct weights and tokenizer\\n      * Fine-tune Llama3-8B-Instruct with LoRA and QLoRA\\n      * Evaluate your fine-tuned Llama3-8B-Instruct model\\n      * Generate text with your fine-tuned model\\n      * Quantize your model to speed up generation\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n\\n\\nLlama3-8B\\n---------\\n\\n`Meta Llama 3 <https://llama.meta.com/llama3>`_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks <https://huggingface.co/meta-llama/Meta-Llama-3-8B#base-pretrained-models>`_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention <https://arxiv.org/abs/2305.13245>`_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken <https://github.com/openai/tiktoken>`_ instead of `sentencepiece <https://github.com/google/sentencepiece>`_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings <https://arxiv.org/abs/2104.09864>`_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let\\'s download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page <https://github.com/meta-llama/llama3/blob/main/README.md>`_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here <https://huggingface.co/settings/tokens>`_.\\n\\n\\n.. code-block:: bash\\n\\n    tune download meta-llama/Meta-Llama-3-8B-Instruct \\\\\\n        --output-dir <checkpoint_dir> \\\\\\n        --hf-token <ACCESS TOKEN>\\n\\n|\\n\\nFine-tuning Llama3-8B-Instruct in torchtune\\n-------------------------------------------\\n\\ntorchtune provides `LoRA <https://arxiv.org/abs/2106.09685>`_, `QLoRA <https://arxiv.org/abs/2305.14314>`_, and full fine-tuning\\nrecipes for fine-tuning Llama3-8B on one or more GPUs. 
For more on LoRA in torchtune, see our :ref:`LoRA Tutorial <lora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\\\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\\\\n        checkpointer.output_dir=<checkpoint_dir>\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\\n\\n.. note::\\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\\n    that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill be saved separately.\\n\\nIn our experiments, we observed a peak memory usage of 18.5 GB. The default config can be trained on a consumer GPU with 24 GB VRAM.\\n\\nIf you have multiple GPUs available, you can run the distributed version of the recipe.\\ntorchtune makes use of the `FSDP <https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html>`_ APIs from PyTorch Distributed\\nto shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster overall training.\\nFor example, on two devices:\\n\\n.. code-block:: bash\\n\\n    tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora\\n\\nFinally, if we want to use even less memory, we can leverage torchtune\\'s QLoRA recipe via:\\n\\n.. TODO (SalmanMohammadi) ref qlora recipe page\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_qlora_single_device\\n\\nSince our default configs enable full bfloat16 training, all of the above commands can be run with\\ndevices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory\\nbelow 10 GB. 
You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.\\nTry it out!\\n\\n|\\n\\nEvaluating fine-tuned Llama3-8B models with EleutherAI\\'s Eval Harness\\n---------------------------------------------------------------------\\n\\nNow that we\\'ve fine-tuned our model, what\\'s next? Let\\'s take our LoRA-finetuned model from the\\npreceding section and look at a couple different ways we can evaluate its performance on the tasks we care about.\\n\\nFirst, torchtune provides an integration with\\n`EleutherAI\\'s evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_\\nfor model evaluation on common benchmark tasks.\\n\\n.. note::\\n    Make sure you\\'ve first installed the evaluation harness via :code:`pip install \"lm_eval==0.4.*\"`.\\n\\nFor this tutorial we\\'ll use the `truthfulqa_mc2 <https://github.com/sylinrl/TruthfulQA>`_ task from the harness.\\nThis task measures a model\\'s propensity to be truthful when answering questions and\\nmeasures the model\\'s zero-shot accuracy on a question followed by one or more true\\nresponses and one or more false responses. First, let\\'s copy the config so we can point the YAML\\nfile to our fine-tuned checkpoint files.\\n\\n.. code-block:: bash\\n\\n    tune cp eleuther_evaluation ./custom_eval_config.yaml\\n\\nNext, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.\\n\\n.. code-block:: yaml\\n\\n    model:\\n      _component_: torchtune.models.llama3.llama3_8b\\n\\n    checkpointer:\\n      _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n      # directory with the checkpoint files\\n      # this should match the output_dir specified during\\n      # fine-tuning\\n      checkpoint_dir: <checkpoint_dir>\\n\\n      # checkpoint files for the fine-tuned model. These will be logged\\n      # at the end of your fine-tune\\n      checkpoint_files: [\\n        meta_model_0.pt\\n      ]\\n\\n      output_dir: <checkpoint_dir>\\n      model_type: LLAMA3\\n\\n    # Make sure to update the tokenizer path to the right\\n    # checkpoint directory as well\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: <checkpoint_dir>/tokenizer.model\\n\\nFinally, we can run evaluation using our modified config.\\n\\n.. code-block:: bash\\n\\n    tune run eleuther_eval --config ./custom_eval_config.yaml\\n\\nTry it for yourself and see what accuracy your model gets!\\n\\n|\\n\\nGenerating text with our fine-tuned Llama3 model\\n------------------------------------------------\\n\\n.. TODO (SalmanMohammadi) ref generate recipe page\\n\\nNext, let\\'s look at one other way we can evaluate our model: generating text! torchtune provides a\\n`recipe for generation <https://github.com/pytorch/torchtune/blob/main/recipes/generate.py>`_ as well.\\n\\nSimilar to what we did, let\\'s copy and modify the default generation config.\\n\\n.. code-block:: bash\\n\\n    tune cp generation ./custom_generation_config.yaml\\n\\nNow we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer.\\n\\n.. code-block:: yaml\\n\\n    model:\\n      _component_: torchtune.models.llama3.llama3_8b\\n\\n    checkpointer:\\n      _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n      # directory with the checkpoint files\\n      # this should match the output_dir specified during\\n      # fine-tuning\\n      checkpoint_dir: <checkpoint_dir>\\n\\n      # checkpoint files for the fine-tuned model. 
These will be logged\\n      # at the end of your fine-tune\\n      checkpoint_files: [\\n        meta_model_0.pt\\n      ]\\n\\n      output_dir: <checkpoint_dir>\\n      model_type: LLAMA3\\n\\n    # Make sure to update the tokenizer path to the right\\n    # checkpoint directory as well\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: <checkpoint_dir>/tokenizer.model\\n\\nRunning generation with our LoRA-finetuned model, we see the following output:\\n\\n.. code-block:: bash\\n\\n    tune run generate --config ./custom_generation_config.yaml \\\\\\n    prompt.user=\"Hello, my name is\"\\n\\n    [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.\\n    ...\\n    [generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec\\n    [generate.py:138] Bandwidth achieved: 346.09 GB/s\\n    [generate.py:139] Memory used: 18.31 GB\\n\\nFaster generation via quantization\\n----------------------------------\\n\\nWe rely on `torchao <https://github.com/pytorch-labs/ao>`_ for `post-training quantization <https://github.com/pytorch/ao/tree/main/torchao/quantization#quantization>`_.\\nTo quantize the fine-tuned model after installing torchao we can run the following command::\\n\\n  # we also support `int8_weight_only()` and `int8_dynamic_activation_int8_weight()`, see\\n  # https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques\\n  # for a full list of techniques that we support\\n  from torchao.quantization.quant_api import quantize_, int4_weight_only\\n  quantize_(model, int4_weight_only())\\n\\nAfter quantization, we rely on torch.compile for speedups. For more details, please see `this example usage <https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#quantization-flow-example>`_.\\n\\ntorchao also provides `this table <https://github.com/pytorch/ao#inference>`_ listing performance and accuracy results for ``llama2`` and ``llama3``.\\n\\nFor Llama models, you can run generation directly in torchao on the quantized model using their ``generate.py`` script as\\ndiscussed in `this readme <https://github.com/pytorch/ao/tree/main/torchao/_models/llama>`_. This way you can compare your own results\\nto those in the previously-linked table.\\n\\n\\nThis is just the beginning of what you can do with Meta Llama3 using torchtune and the broader ecosystem.\\nWe look forward to seeing what you build!\\n\\n404: Not Found\\n.. _qat_finetune_label:\\n\\n===========================\\nFine-Tuning Llama3 with QAT\\n===========================\\n\\nQuantization-Aware Training (QAT) is a common technique for users to quantize their\\nmodels without incurring significant degradations in accuracy or perplexity. In this\\ntutorial, we’ll walk through how to apply QAT during fine-tuning, quantize the\\nresulting model, and evaluate your quantized model using torchtune.\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What QAT is and how it helps reduce quantization degradation\\n      * How to run QAT during fine-tuning in torchtune\\n      * End-to-end example of connecting QAT, quantization, and evaluation recipes\\n\\n    .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama3-8B model weights<download_llama_label>`\\n\\n.. _what_is_qat_label:\\n\\nWhat is QAT?\\n------------\\n\\n`Quantization-Aware Training <https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training>`_ (QAT) refers to simulating quantization numerics during\\ntraining or fine-tuning, with the end goal of ultimately producing a higher quality\\nquantized model compared to simple post-training quantization (PTQ). During QAT,\\nthe weights and/or activations are “fake quantized”, meaning they are transformed\\nas if they were being quantized, but kept in the original data type (e.g. bfloat16)\\nwithout being actually cast to lower bit-widths. Thus, fake quantization allows the\\nmodel to adjust for quantization noise when updating the weights, hence the training\\nprocess is “aware” that the model will ultimately be quantized after training.\\n\\n.. code-block:: python\\n\\n  # PTQ: x_q is quantized and cast to int8\\n  # scale and zero point (zp) refer to parameters used to quantize x_float\\n  # qmin and qmax refer to the range of quantized values\\n  x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8)\\n\\n  # QAT: x_fq is still in float\\n  # Fake quantize simulates the numerics of quantize + dequantize\\n  x_fq = (x_float / scale + zp).round().clamp(qmin, qmax)\\n  x_fq = (x_fq - zp) * scale\\n\\nQAT typically involves applying a transformation to your model before and after training.\\nFor example, in the `torchao QAT implementation <https://github.com/pytorch/ao/blob/v0.2.0/torchao/quantization/prototype/qat.py>`_,\\nthese are represented as the ``prepare()`` and ``convert()`` steps: (1) ``prepare()`` inserts fake quantize\\noperations into linear layers, and (2) ``convert()`` transforms the fake quantize operations\\nto actual quantize and dequantize operations after training, thereby producing a quantized\\nmodel (dequantize operations are typically fused with linear after lowering).\\nBetween these two steps, training can proceed exactly as before.\\n\\n.. image:: /_static/img/qat_diagram.png\\n\\n.. _apply_qat_label:\\n\\nApplying QAT to Llama3 models\\n-----------------------------\\n\\nWe can easily apply the above QAT transformations to Llama3 for fine-tuning,\\nleveraging the APIs in torchao as follows:\\n\\n.. 
code-block:: python\\n\\n  import copy\\n  import torch\\n  from torchao.quantization import quantize_\\n  from torchao.quantization.qat import (\\n      FakeQuantizeConfig,\\n      IntXQuantizationAwareTrainingConfig,\\n  )\\n  from torchtune.models.llama3 import llama3_8b\\n\\n  model = llama3_8b()\\n  original_model = copy.deepcopy(model)\\n\\n  # Config for int8 dynamic asymmetric per token activations +\\n  # int4 symmetric per group weights, only for linear layers\\n  activation_config = FakeQuantizeConfig(torch.int8, \"per_token\", is_symmetric=False)\\n  weight_config = FakeQuantizeConfig(torch.int4, group_size=32)\\n  qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)\\n\\n  # Prepare the model for quantization-aware fine-tuning.\\n  #\\n  # This step inserts \"fake quantize\" ops that simulate\\n  # quantization numerics during fine-tuning without\\n  # actually casting the activations/weights to lower-bit\\n  # dtypes like in \"real\" quantization.\\n  quantize_(model, qat_config)\\n\\n  prepared_model = model\\n\\nThe model is now ready for QAT fine-tuning! If we print the model we’ll see that\\nall linear layers have been swapped with :code:`FakeQuantizedLinear`, which simulates\\nthe numerics of int8 dynamic asymmetric per token activations + int4 symmetric\\nper group weights:\\n\\n.. code-block:: bash\\n\\n  >>> original_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n.. 
code-block:: bash\\n\\n  >>> prepared_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=4096, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (k_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=1024, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (v_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=1024, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (output_proj): FakeQuantizedLinear(\\n      in_features=4096, out_features=4096, bias=False\\n      (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n      (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))\\n    )\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\nAfter fine-tuning, we can convert the model to get an actual quantized model:\\n\\n.. 
code-block:: python\\n\\n  from torchao.quantization.qat import (\\n      FromIntXQuantizationAwareTrainingConfig,\\n  )\\n  from torchao.quantization import (\\n      Int8DynamicActivationInt4WeightConfig,\\n  )\\n\\n  # Fine-tune as before\\n  train_loop(prepared_model)\\n\\n  # Convert the fake quantized model into an actual quantized model\\n  #\\n  # First, we swap `FakeQuantizedLinear` back to `torch.nn.Linear`\\n  # while keeping the QAT fine-tuned weights. Then, we perform standard\\n  # post-training quantization (PTQ), which inserts quantized activation\\n  # and weight tensor subclasses\\n  quantize_(prepared_model, FromIntXQuantizationAwareTrainingConfig())\\n  quantize_(prepared_model, Int8DynamicActivationInt4WeightConfig(group_size=32))\\n\\n  converted_model = prepared_model\\n\\nThe model is now fully quantized to int8 and int4 and ready for inference\\nor generation. If we print the model now, we will see the linear layers\\nare now swapped back to :code:`torch.nn.Linear`, but with quantized tensor\\nactivations and weights:\\n\\n.. code-block:: bash\\n\\n  >>> converted_model.layers[0].attn\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (k_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (v_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (output_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n\\nQAT finetuning recipe in torchtune\\n----------------------------------\\n\\nPutting it all together, we can now fine-tune a model using torchtune’s :ref:`QAT recipe<qat_distributed_recipe_label>`.\\nMake sure that you have first downloaded the Llama3 weights and tokenizer by\\nfollowing :ref:`these instructions<download_llama_label>`. In this tutorial,\\nwe use the following settings to demonstrate QAT’s effectiveness in recovering\\nquantization degradation compared to directly quantizing a model fine-tuned\\nwithout QAT. You can copy the default QAT config and make the following\\nmodifications accordingly:\\n\\n.. code-block:: bash\\n\\n  tune cp llama3/8B_qat_full custom_8B_qat_full.yaml\\n\\n.. 
code-block:: yaml\\n\\n  dataset:\\n    _component_: torchtune.datasets.text_completion_dataset\\n    source: allenai/c4\\n    column: text\\n    name: en\\n    split: train\\n\\n  ...\\n\\n  epochs: 1\\n  max_steps_per_epoch: 2000\\n  fake_quant_after_n_steps: 1000\\n\\nBy default, this uses the :code:`torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer`,\\nwhich uses the same fake quantization configurations as the example above.\\n\\nEmpirically, we observed that disabling fake quantization for the first N steps\\nled to better results, presumably because doing so allows the weights to stabilize\\nbefore we start introducing quantization noise to the fine-tuning process.\\nFor this reason, here we disable fake quantization for the first 1000 steps.\\n\\nYou can then use the following command to run fine-tuning with QAT using the above\\nconfig. This workload requires at least 6 GPUs, each with VRAM of at least 80GB.\\nBy default, this uses the int8 dynamic per token activations + int4 grouped per\\nchannel weights quantization configuration as shown above:\\n\\n.. code-block:: bash\\n\\n  tune run --nnodes 1 --nproc_per_node 6 qat_distributed --config custom_8B_qat_full.yaml\\n\\n.. note::\\n\\n  Make sure to point to the location of your Llama3 weights and tokenizer. This can be done\\n  either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n  or by directly modifying the :code:`8B_qat_full.yaml` file. See our :ref:`config_tutorial_label`\\n  for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n\\n  QAT introduces memory and computation overheads compared to regular fine-tuning,\\n  since fake quantization fundamentally involves extra ops and requires cloning\\n  the weights to avoid mutating them when computing the fake quantized values.\\n  In general, we expect around 30% decrease in fine-tuning speed for models like\\n  Llama3-8B. With activation checkpointing, the increase in memory footprint per\\n  GPU is minimal (< 5GB per GPU).\\n\\n\\nQuantizing the QAT model\\n------------------------\\n\\nNote that the QAT recipe above produces an unquantized bfloat16 model. The model\\nstructure is exactly the same as the model produced with regular full fine-tuning\\nwithout QAT, just with different weights. To actually get a quantized model,\\ncopy and make the following modifications to the quantization config:\\n\\n.. code-block:: bash\\n\\n  tune cp quantization custom_quantization.yaml\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.llama3_8b\\n\\n  checkpointer:\\n    _component_: torchtune.training.FullModelMetaCheckpointer\\n    checkpoint_dir: <your QAT checkpoint dir>\\n    checkpoint_files: [ft-model-00001-of-00001.bin]\\n    output_dir: <your QAT checkpoint dir>\\n    model_type: LLAMA3\\n\\n  ...\\n\\n  quantizer:\\n    _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer\\n    groupsize: 256\\n\\nThe following command performs the convert step in the QAT flow, which actually\\nquantizes the float model to a model with quantized weights:\\n\\n.. code-block:: bash\\n\\n  tune run quantize --config custom_quantization.yaml\\n\\n.. note::\\n\\n  Make sure to use the same QAT quantizer you used to fine-tune your model,\\n  otherwise the numerics will be off and the quantized model will perform poorly.\\n\\n.. 
_qat_eval_label:\\n\\nEvaluating the quantized model\\n------------------------------\\n\\nNow that we have a quantized model, we can run some evaluations on it and compare the\\nresults against regular fine-tuning without QAT (i.e. post-training quantization).\\nTo achieve this, we use `EleutherAI’s evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_\\nintegrated in torchtune. First, copy the evaluation config and make the following changes:\\n\\n.. code-block:: bash\\n\\n  tune cp eleuther_evaluation custom_eleuther_evaluation.yaml\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.llama3_8b\\n\\n  checkpointer:\\n    _component_: torchtune.training.FullModelTorchTuneCheckpointer\\n    checkpoint_dir: <your quantized model checkpoint dir>\\n    checkpoint_files: [ft-model-00001-of-00001-8da4w.bin]\\n    output_dir: <your quantized model checkpoint dir>\\n    model_type: LLAMA3\\n\\n  ...\\n\\n  tasks: [\"hellaswag\", \"wikitext\"]\\n\\n  quantizer:\\n    _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer\\n    groupsize: 256\\n\\n.. note::\\n\\n  Since we are passing in a quantized model, be sure to use the corresponding\\n  post-training quantizer instead of the QAT quantizer. For example, if you\\n  used the :code:`Int8DynActInt4WeightQATQuantizer` during fine-tuning, you\\n  should specify :code:`Int8DynActInt4WeightQuantizer` in this step. See the\\n  `quantization recipe <https://github.com/pytorch/torchtune/blob/main/recipes/quantize.py>`_\\n  for a full list of supported quantizers.\\n\\nNow run the evaluation recipe with the config we just created:\\n\\n.. code-block:: bash\\n\\n  tune run eleuther_eval --config custom_eleuther_evaluation.yaml\\n\\nThe results should look something like this:\\n\\n.. code-block:: bash\\n\\n  # QAT quantized model evaluation results (int8 activations + int4 weights)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     |Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|-----:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|9.9148|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity|1.5357|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  |0.6189|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            |0.5687|±  |0.0049|\\n  |         |       |none  |     0|acc_norm       |0.7536|±  |0.0043|\\n\\nComparing these results to the model fine-tuned without QAT, we can see that\\nQAT was able to recover a significant portion of the quantization degradations\\nfrom the original unquantized model compared to PTQ. For example, normalized\\naccuracy in the hellaswag task dropped by 2.20% with PTQ but only 0.74% with\\nQAT when compared to the original unquantized model. Similarly, word perplexity\\nin the wikitext task increased by 2.048 with PTQ but only 1.190 with QAT (lower\\nis better).\\n\\n.. code-block:: bash\\n\\n  # PTQ quantized model evaluation results (int8 activations + int4 weights)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     | Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|------:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|10.7735|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity| 1.5598|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  | 0.6413|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            | 0.5481|±  |0.0050|\\n  |         |       |none  |     0|acc_norm       | 0.7390|±  |0.0044|\\n\\n.. 
code-block:: bash\\n\\n  # Float model evaluation results (bfloat16)\\n\\n  |  Tasks  |Version|Filter|n-shot|    Metric     |Value |   |Stderr|\\n  |---------|------:|------|-----:|---------------|-----:|---|------|\\n  |wikitext |      2|none  |     0|word_perplexity|8.7251|±  |N/A   |\\n  |         |       |none  |     0|byte_perplexity|1.4994|±  |N/A   |\\n  |         |       |none  |     0|bits_per_byte  |0.5844|±  |N/A   |\\n  |hellaswag|      1|none  |     0|acc            |0.5740|±  |0.0049|\\n  |         |       |none  |     0|acc_norm       |0.7610|±  |0.0043|\\n\\nThus, the QAT flow produced a quantized model that outperforms the post-training\\nquantized model. Importantly, the quantized model structure is identical in both\\nflows, and so the model size, memory usage, and all other performance\\ncharacteristics are also the same.\\n\\nNote that although the weights are quantized to int4, the quantized model size\\nfor both the QAT and the PTQ flows are 8.187 GB, while the original float model\\nis 14.958 GB. This is because this quantizer uses int8 to represent the weights\\nas PyTorch does not have native int4 dtype support. A more efficient representation\\nis to pack the int4 weights, which will halve the quantized model size. This is\\nwhat the Int4WeightOnlyQuantizer does, and the corresponding QAT quantizer will\\nbe added in the future.\\n\\nLowering QAT model to device (optional)\\n---------------------------------------\\n\\nOne important motivation for quantizing a model is to be able to run it in resource\\nconstrained environments. You can further lower your QAT Llama3 model to edge devices\\nsuch as smartphones using `executorch <https://github.com/pytorch/executorch/>`_ by\\nfollowing `these instructions <https://github.com/pytorch/executorch/tree/main/examples/models/llama2>`_.\\nFor example, the following command lowers the model to the XNNPACK backend:\\n\\n.. code-block:: bash\\n\\n  python -m examples.models.llama2.export_llama --checkpoint <your QAT checkpoint> -p <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 256 -d fp32 --metadata \\'{\"get_bos_id\":128000, \"get_eos_id\":128001}\\' --embedding-quantize 4,32 --output_name=\"llama3_8da4w.pte\"\\n\\nThis results in a much smaller quantized model of size 3.881 GB. When benchmarked on a OnePlus 12 smartphone, this model also achieved the same inference and generation speeds as the post-training quantized model. This is because the model structures are the same across the two flows:\\n\\n.. list-table::\\n   :widths: 25 25 25\\n   :header-rows: 1\\n\\n   * -\\n     - QAT\\n     - PTQ\\n   * - Quantized model size\\n     - 3.881 GB\\n     - 3.881 GB\\n   * - Inference speed\\n     - 9.709 tok/s\\n     - 9.815 tok/s\\n   * - Generation speed\\n     - 11.316 tok/s\\n     - 11.364 tok/s\\n\\n.. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network\\'s remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer\\'s self-attention.\\n\\n.. note::\\n\\n    If you\\'re unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html>`_,\\nyou can expect to see further memory savings from the optimizer state.\\n\\n.. note::\\n\\n    LoRA memory savings come primarily from gradient and optimizer states,\\n    so if your model\\'s peak memory comes in its :code:`forward()` method, then LoRA\\n    may not reduce peak memory.\\n\\nHow does LoRA work?\\n-------------------\\n\\nLoRA replaces weight update matrices with a low-rank approximation. In general, weight updates\\nfor an arbitrary :code:`nn.Linear(in_dim,out_dim)` layer could have rank as high as\\n:code:`min(in_dim,out_dim)`. LoRA (and other related papers such as `Aghajanyan et al. <https://arxiv.org/abs/2012.13255>`_)\\nhypothesize that the `intrinsic dimension <https://en.wikipedia.org/wiki/Intrinsic_dimension>`_\\nof these updates during LLM fine-tuning can in fact be much lower.\\nTo take advantage of this property, LoRA finetuning will freeze the original model,\\nthen add a trainable weight update from a low-rank projection. More explicitly, LoRA trains two\\nmatrices :code:`A` and :code:`B`. :code:`A` projects the inputs down to a much smaller rank (often four or eight in practice), and\\n:code:`B` projects back up to the dimension output by the original linear layer.\\n\\nThe image below gives a simplified representation of a single weight update step from a full finetune\\n(on the left) compared to a weight update step with LoRA (on the right). The LoRA matrices :code:`A` and :code:`B`\\nserve as an approximation to the full rank weight update in blue.\\n\\n.. image:: /_static/img/lora_diagram.png\\n\\nAlthough LoRA introduces a few extra parameters in the model :code:`forward()`, only the :code:`A` and :code:`B` matrices are trainable.\\nThis means that with a rank :code:`r` LoRA decomposition, the number of gradients we need to store reduces\\nfrom :code:`in_dim*out_dim` to :code:`r*(in_dim+out_dim)`. 
(Remember that in general :code:`r`\\nis much smaller than :code:`in_dim` and :code:`out_dim`.)\\n\\nFor example, in the 7B Llama2\\'s self-attention, :code:`in_dim=out_dim=4096` for the Q, K,\\nand V projections. This means a LoRA decomposition of rank :code:`r=8` will reduce the number of trainable\\nparameters for a given projection from :math:`4096 * 4096 \\\\approx 15M` to :math:`8 * 8192 \\\\approx 65K`, a\\nreduction of over 99%.\\n\\nLet\\'s take a look at a minimal implementation of LoRA in native PyTorch.\\n\\n\\n.. code-block:: python\\n\\n  import torch\\n  from torch import nn\\n\\n  class LoRALinear(nn.Module):\\n    def __init__(\\n      self,\\n      in_dim: int,\\n      out_dim: int,\\n      rank: int,\\n      alpha: float,\\n      dropout: float\\n    ):\\n      # These are the weights from the original pretrained model\\n      self.linear = nn.Linear(in_dim, out_dim, bias=False)\\n\\n      # These are the new LoRA params. In general rank << in_dim, out_dim\\n      self.lora_a = nn.Linear(in_dim, rank, bias=False)\\n      self.lora_b = nn.Linear(rank, out_dim, bias=False)\\n\\n      # Rank and alpha are commonly-tuned hyperparameters\\n      self.rank = rank\\n      self.alpha = alpha\\n\\n      # Most implementations also include some dropout\\n      self.dropout = nn.Dropout(p=dropout)\\n\\n      # The original params are frozen, and only LoRA params are trainable.\\n      self.linear.weight.requires_grad = False\\n      self.lora_a.weight.requires_grad = True\\n      self.lora_b.weight.requires_grad = True\\n\\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\\n      # This would be the output of the original model\\n      frozen_out = self.linear(x)\\n\\n      # lora_a projects inputs down to the much smaller self.rank,\\n      # then lora_b projects back up to the output dimension\\n      lora_out = self.lora_b(self.lora_a(self.dropout(x)))\\n\\n      # Finally, scale by the alpha parameter (normalized by rank)\\n      # and add to the original model\\'s outputs\\n      return frozen_out + (self.alpha / self.rank) * lora_out\\n\\nThere are some other details around initialization which we omit here, but if you\\'d like to know more\\nyou can see our implementation in :class:`~torchtune.modules.peft.LoRALinear`.\\nNow that we understand what LoRA is doing, let\\'s look at how we can apply it to our favorite models.\\n\\nApplying LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n  # Build Llama2 without any LoRA layers\\n  base_model = llama2_7b()\\n\\n  # The default settings for lora_llama2_7b will match those for llama2_7b\\n  # We just need to define which layers we want LoRA applied to.\\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n  # layers outside of the self-attention.\\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\\n    See :ref:`below<setting_trainable_params>` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n  # Print the first layer\\'s self-attention in the usual Llama2 model\\n  >>> print(base_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n  # Print the same for Llama2 with LoRA weights\\n  >>> print(lora_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n      (lora_a): Linear(in_features=4096, out_features=8, bias=False)\\n      (lora_b): Linear(in_features=8, out_features=4096, bias=False)\\n    )\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n      (lora_a): Linear(in_features=4096, out_features=8, bias=False)\\n      (lora_b): Linear(in_features=8, out_features=4096, bias=False)\\n    )\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n\\nNotice that our LoRA model\\'s layer contains additional weights in the Q and V projections,\\nas expected. Additionally, inspecting the type of :code:`lora_model` and\\n:code:`base_model`, would show that they are both instances of the same :class:`~torchtune.modules.TransformerDecoder`.\\n(Feel free to verify this for yourself.)\\n\\nWhy does this matter? torchtune makes it easy to load checkpoints for LoRA directly from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n  # Assuming that base_model already has the pretrained Llama2 weights,\\n  # this will directly load them into your LoRA model without any conversion necessary.\\n  lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n    Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n    the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n    :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n  from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n  # Fetch all params from the model that are associated with LoRA.\\n  lora_params = get_adapter_params(lora_model)\\n\\n  # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n  set_trainable_params(lora_model, lora_params)\\n\\n  # Print the total number of parameters\\n  total_params = sum([p.numel() for p in lora_model.parameters()])\\n  trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n  print(\\n    f\"\"\"\\n    {total_params} total params,\\n    {trainable_params} trainable params,\\n    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n    \"\"\"\\n  )\\n\\n  6742609920 total params,\\n  4194304 trainable params,\\n  0.06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\\n\\nLet\\'s run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\\n\\n.. 
code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\\\\n    lora_attn_modules=[\\'q_proj\\',\\'k_proj\\',\\'v_proj\\',\\'output_proj\\'] \\\\\\n    lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\\n\\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\\n\\n.. image:: /_static/img/lora_experiment_loss_curves.png\\n\\n.. note::\\n    The above figure was generated with W&B. You can use torchtune\\'s :class:`~torchtune.training.metric_logging.WandBLogger`\\n    to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\\n    using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\\n\\n.. _lora_tutorial_memory_tradeoff_label:\\n\\nTrading off memory and model performance with LoRA\\n--------------------------------------------------\\n\\nIn the preceding example, we ran LoRA on two devices. But given LoRA\\'s low memory footprint, we can run fine-tuning\\non a single device using most commodity GPUs which support `bfloat16 <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#bfloat16_floating-point_format>`_\\nfloating-point format. This can be done via the command:\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama2/7B_lora_single_device\\n\\nOn a single device, we may need to be more cognizant of our peak memory. Let\\'s run a few experiments\\nto see our peak memory during a finetune. We will experiment along two axes:\\nfirst, which model layers have LoRA applied, and second, the rank of each LoRA layer. (We will scale\\nalpha in parallel to LoRA rank, as discussed above.)\\n\\nTo compare the results of our experiments, we can evaluate our models on `truthfulqa_mc2 <https://github.com/sylinrl/TruthfulQA>`_, a task from\\nthe `TruthfulQA <https://arxiv.org/abs/2109.07958>`_ benchmark for language models. For more details on how to run this and other evaluation tasks\\nwith torchtune\\'s EleutherAI evaluation harness integration, see our :ref:`End-to-End Workflow Tutorial <eval_harness_label>`.\\n\\nPreviously, we only enabled LoRA for the linear layers in each self-attention module, but in fact there are other linear\\nlayers we can apply LoRA to: MLP layers and our model\\'s final output projection. Note that for Llama-2-7B the final output\\nprojection maps to the vocabulary dimension (32000 instead of 4096 as in the other linear layers), so enabling LoRA for this layer will increase\\nour peak memory a bit more than the other layers. We can make the following changes to our config:\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'k_proj\\', \\'v_proj\\', \\'output_proj\\']\\n    apply_lora_to_mlp: True\\n    apply_lora_to_output: True\\n  ...\\n\\n.. note::\\n    All the finetuning runs below use the `llama2/7B_lora_single_device <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama2/7B_lora_single_device.yaml>`_\\n    config, which has a default batch size of 2. Modifying the batch size (or other hyperparameters, e.g. the optimizer) will impact both peak memory\\n    and final evaluation results.\\n\\n.. 
list-table::\\n   :widths: 25 25 25 25 25\\n   :header-rows: 1\\n\\n   * - LoRA Layers\\n     - Rank\\n     - Alpha\\n     - Peak Memory\\n     - Accuracy (truthfulqa_mc2)\\n   * - Q and V only\\n     - 8\\n     - 16\\n     - **15.57 GB**\\n     - 0.475\\n   * - all layers\\n     - 8\\n     - 16\\n     - 15.87 GB\\n     - 0.508\\n   * - Q and V only\\n     - 64\\n     - 128\\n     - 15.86 GB\\n     - 0.504\\n   * - all layers\\n     - 64\\n     - 128\\n     - 17.04 GB\\n     - **0.514**\\n\\nWe can see that our baseline settings give the lowest peak memory, but our evaluation performance is relatively lower.\\nBy enabling LoRA for all linear layers and increasing the rank to 64, we see almost a 4% absolute improvement\\nin our accuracy on this task, but our peak memory also increases by about 1.4GB. These are just a couple simple\\nexperiments; we encourage you to run your own finetunes to find the right tradeoff for your particular setup.\\n\\nAdditionally, if you want to decrease your model\\'s peak memory even further (and still potentially achieve similar\\nmodel quality results), you can check out our :ref:`QLoRA tutorial<qlora_finetune_label>`.\\n'\n",
-       "│   │   │   │   )\n",
-       "│   │   │   ],\n",
-       "│   │   │   output_message=CompletionMessage(\n",
-       "│   │   │   │   content='Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision). The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.',\n",
-       "│   │   │   │   role='assistant',\n",
-       "│   │   │   │   stop_reason='end_of_turn',\n",
-       "│   │   │   │   tool_calls=[]\n",
-       "│   │   │   ),\n",
-       "│   │   │   session_id='6910f07f-f8e0-407b-8441-60a90e7b1834',\n",
-       "│   │   │   started_at=datetime.datetime(2025, 3, 22, 19, 29, 16, 883581, tzinfo=datetime.timezone.utc),\n",
-       "│   │   │   steps=[\n",
-       "│   │   │   │   InferenceStep(\n",
-       "│   │   │   │   │   api_model_response=CompletionMessage(\n",
-       "│   │   │   │   │   │   content='Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision). The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.',\n",
-       "│   │   │   │   │   │   role='assistant',\n",
-       "│   │   │   │   │   │   stop_reason='end_of_turn',\n",
-       "│   │   │   │   │   │   tool_calls=[]\n",
-       "│   │   │   │   │   ),\n",
-       "│   │   │   │   │   step_id='49409ea3-4a4d-4433-aa71-e6e4ec1bb054',\n",
-       "│   │   │   │   │   step_type='inference',\n",
-       "│   │   │   │   │   turn_id='212541bc-0cfa-4f04-a8a5-25fe2892bc8f',\n",
-       "│   │   │   │   │   completed_at=datetime.datetime(2025, 3, 22, 19, 29, 19, 144218, tzinfo=TzInfo(UTC)),\n",
-       "│   │   │   │   │   started_at=datetime.datetime(2025, 3, 22, 19, 29, 17, 267803, tzinfo=TzInfo(UTC))\n",
-       "│   │   │   │   )\n",
-       "│   │   │   ],\n",
-       "│   │   │   turn_id='212541bc-0cfa-4f04-a8a5-25fe2892bc8f',\n",
-       "│   │   │   completed_at=datetime.datetime(2025, 3, 22, 19, 29, 19, 155387, tzinfo=TzInfo(UTC)),\n",
-       "│   │   │   output_attachments=[]\n",
-       "│   │   )\n",
-       "]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSession\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[33msession_id\u001b[0m=\u001b[32m'6910f07f-f8e0-407b-8441-60a90e7b1834'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33msession_name\u001b[0m=\u001b[32m'simple_session_fd65775f-25c4-465b-86e9-7cee68d15130'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m16\u001b[0m, \u001b[1;36m862958\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[35mdatetime\u001b[0m.timezone.utc\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mturns\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mTurn\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33minput_messages\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;35mUserMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'What precision formats does torchtune support?'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'user'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcontext\u001b[0m=\u001b[32m'.. _memory_optimization_overview_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\nMemory Optimization Overview\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\n\\n**Author**: `Salman Mohammadi \u001b[0m\u001b[32m<\u001b[0m\u001b[32mhttps:\u001b[0m\u001b[32m//github.com/SalmanMohammadi>`_\\n\\ntorchtune comes with a host of plug-and-play memory optimization components which give you lots of flexibility\\nto ``tune`` our recipes to your hardware. This page provides a brief glossary of these components and how you might use them.\\nTo make things easy, we\\'ve summarized these components in the following table:\\n\\n.. csv-table:: Memory optimization components\\n :header: \"Component\", \"When to use?\"\\n :widths: auto\\n\\n \":ref:`glossary_precision`\", \"You\\'ll usually want to leave this as its default ``bfloat16``. It uses 2 bytes per model parameter instead of 4 bytes when using ``float32``.\"\\n \":ref:`glossary_act_ckpt`\", \"Use when you\\'re memory constrained and want to use a larger model, batch size or context length. Be aware that it will slow down training speed.\"\\n \":ref:`glossary_act_off`\", \"Similar to activation checkpointing, this can be used when memory constrained, but may decrease training speed. This **should** be used alongside activation checkpointing.\"\\n \":ref:`glossary_grad_accm`\", \"Helpful when memory-constrained to simulate larger batch sizes. Not compatible with optimizer in backward. Use it when you can already fit at least one sample without OOMing, but not enough of them.\"\\n \":ref:`glossary_low_precision_opt`\", \"Use when you want to reduce the size of the optimizer state. This is relevant when training large models and using optimizers with momentum, like Adam. 
Note that lower precision optimizers may reduce training stability/accuracy.\"\\n \":ref:`glossary_opt_in_bwd`\", \"Use it when you have large gradients and can fit a large enough batch size, since this is not compatible with ``gradient_accumulation_steps``.\"\\n \":ref:`glossary_cpu_offload`\", \"Offloads optimizer states and \u001b[0m\u001b[32m(\u001b[0m\u001b[32moptionally\u001b[0m\u001b[32m)\u001b[0m\u001b[32m gradients to CPU, and performs optimizer steps on CPU. This can be used to significantly reduce GPU memory usage at the cost of CPU RAM and training speed. Prioritize using it only if the other techniques are not enough.\"\\n \":ref:`glossary_lora`\", \"When you want to significantly reduce the number of trainable parameters, saving gradient and optimizer memory during training, and significantly speeding up training. This may reduce training accuracy\"\\n \":ref:`glossary_qlora`\", \"When you are training a large model, since quantization will save 1.5 bytes * \u001b[0m\u001b[32m(\u001b[0m\u001b[32m# of model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, at the potential cost of some training speed and accuracy.\"\\n \":ref:`glossary_dora`\", \"a variant of LoRA that may improve model performance at the cost of slightly more memory.\"\\n\\n\\n.. note::\\n\\n In its current state, this tutorial is focused on single-device optimizations. Check in soon as we update this page\\n for the latest memory optimization features for distributed fine-tuning.\\n\\n.. _glossary_precision:\\n\\n\\nModel Precision\\n---------------\\n\\n*What\\'s going on here?*\\n\\nWe use the term \"precision\" to refer to the underlying data type used to represent the model and optimizer parameters.\\nWe support two data types in torchtune:\\n\\n.. note::\\n\\n We recommend diving into Sebastian Raschka\\'s `blogpost on mixed-precision techniques `_\\n for a deeper understanding of concepts around precision and data formats.\\n\\n* ``fp32``, commonly referred to as \"full-precision\", uses 4 bytes per model and optimizer parameter.\\n* ``bfloat16``, referred to as \"half-precision\", uses 2 bytes per model and optimizer parameter - effectively half\\n the memory of ``fp32``, and also improves training speed. Generally, if your hardware supports training with ``bfloat16``,\\n we recommend using it - this is the default setting for our recipes.\\n\\n.. note::\\n\\n Another common paradigm is \"mixed-precision\" training: where model weights are in ``bfloat16`` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mor ``fp16``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and optimizer\\n states are in ``fp32``. Currently, we don\\'t support mixed-precision training in torchtune.\\n\\n*Sounds great! How do I use it?*\\n\\nSimply use the ``dtype`` flag or config entry in all our recipes! For example, to use half-precision training in ``bf16``,\\nset ``\u001b[0m\u001b[32mdtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mbf16\u001b[0m\u001b[32m``.\\n\\n.. 
_glossary_act_ckpt:\\n\\nActivation Checkpointing\\n------------------------\\n\\n*What\\'s going on here?*\\n\\nThe relevant section in the `PyTorch documentation `_ explains this concept well.\\nTo quote:\\n\\n Activation checkpointing is a technique that trades compute for memory.\\n Instead of keeping tensors needed for backward alive until they are used in\\n gradient computation during backward, forward computation in checkpointed\\n regions omits saving tensors for backward and recomputes them during the backward pass.\\n\\nThis setting is helpful for when you\\'re memory-constrained, especially due to larger batch sizes or longer context lengths.\\nHowever, these savings in memory come at the cost of training speed \u001b[0m\u001b[32m(\u001b[0m\u001b[32mi.e. tokens-per-second\u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\nand in most cases training can slow down quite a bit as a result of this activation recomputation.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation checkpointing, use ``\u001b[0m\u001b[32menable_activation_checkpointing\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_act_off:\\n\\nActivation Offloading\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nYou may have just read about activation checkpointing! Similar to checkpointing, offloading is a memory\\nefficiency technique that allows saving GPU VRAM by temporarily moving activations to CPU and bringing\\nthem back when needed in the backward pass.\\n\\nSee `PyTorch autograd hook tutorial `_\\nfor more details about how this is implemented through :func:`torch.autograd.graph.saved_tensors_hooks`.\\n\\nThis setting is especially helpful for larger batch sizes, or longer context lengths when you\\'re memory constrained.\\nWhile of course it takes runtime and resources to move Tensors from GPU to CPU and back, the implementation in\\ntorchtune uses multiple CUDA streams \u001b[0m\u001b[32m(\u001b[0m\u001b[32mwhen available\u001b[0m\u001b[32m)\u001b[0m\u001b[32m in order to overlap the extra communication with the computation\\nto hide the extra runtime. As the communication workload is variable depending on the number and size of tensors being\\noffloaded, we do not recommend using it unless :ref:`glossary_act_ckpt` is also enabled, in which case only the checkpointed\\ntensors will be offloaded.\\n\\n*Sounds great! How do I use it?*\\n\\nTo enable activation offloading, use the ``enable_activation_offloading`` config entry or flag\\nin our lora finetuning single device recipe, e.g. ``\u001b[0m\u001b[32menable_activation_offloading\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``. To allow\\nusage of streams, make sure you are on a torch version equal to or later than PyTorch.\\n\\n.. _glossary_grad_accm:\\n\\nGradient Accumulation\\n---------------------\\n\\n*What\\'s going on here?*\\n\\nGradient accumulation allows you to simulate large batch sizes by *accumulating* gradients over several\\nbatches before updating model parameters using the optimizer. Concretely, the total number of samples used\\nfor a gradient update is when using gradient accumulation is:\\n\\n ``total_batch_size = batch_size * gradient_accumulation_steps``\\n\\nFor example: with ``\u001b[0m\u001b[32mbatch_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1\u001b[0m\u001b[32m`` and ``\u001b[0m\u001b[32mgradient_accumulation_steps\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m`` we get a total batch size of 32.\\n\\n.. 
note::\\n\\n For other components in torchtune which use \"steps\", such as :ref:`metric logging `, or\\n :func:`learning rate schedulers `, a \"step\" is counted as a\\n single update to model parameters, rather than a single model forward pass with the data.\\n Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``.\\n Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.\\n For this reason, metric logging will appear less frequently when training with gradient accumulation,\\n and progress bars may update more slowly.\\n\\n\\nIf you\\'re using one of our distributed recipes, simply multiply by the number of devices:\\n\\n ``total_batch_size = batch_size * gradient_accumulation_steps * num_devices``\\n\\nGradient accumulation is especially useful when you can fit at least one sample in your GPU. In this case, artificially increasing the batch by\\naccumulating gradients might give you faster training speeds than using other memory optimization techniques that trade-off memory for speed, like :ref:`activation checkpointing `.\\n\\n*Sounds great! How do I use it?*\\n\\nAll of our finetuning recipes support simulating larger batch sizes by accumulating gradients. Just set the\\n``gradient_accumulation_steps`` flag or config entry.\\n\\n.. note::\\n\\n Gradient accumulation should always be set to 1 when :ref:`fusing the optimizer step into the backward pass `.\\n\\nOptimizers\\n----------\\n\\n.. _glossary_low_precision_opt:\\n\\nLower Precision Optimizers\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nIn addition to :ref:`reducing model and optimizer precision ` during training, we can further reduce precision in our optimizer states.\\nAll of our recipes support lower-precision optimizers from the `torchao `_ library.\\nFor single device recipes, we also support `bitsandbytes `_.\\n\\nA good place to start might be the :class:`torchao.prototype.low_bit_optim.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers.\\nBoth reduce memory by quantizing the optimizer state dict. Paged optimizers will also offload to CPU if there isn\\'t enough GPU memory available. In practice,\\nyou can expect higher memory savings from bnb\\'s PagedAdamW8bit but higher training speed from torchao\\'s AdamW8bit.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this in your recipes, make sure you have installed torchao \u001b[0m\u001b[32m(\u001b[0m\u001b[32m``pip install torchao``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m or bitsandbytes \u001b[0m\u001b[32m(\u001b[0m\u001b[32m``pip install bitsandbytes``\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. Then, enable\\na low precision optimizer using the :ref:`cli_label`:\\n\\n\\n.. code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorchao\u001b[0m\u001b[32m.prototype.low_bit_optim.AdamW8bit\\n\\n.. code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mbitsandbytes\u001b[0m\u001b[32m.optim.PagedAdamW8bit\\n\\nor by directly :ref:`modifying a config file`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: bitsandbytes.optim.PagedAdamW8bit\\n lr: 2e-5\\n\\n.. _glossary_opt_in_bwd:\\n\\nFusing Optimizer Step into Backward Pass\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nStateful optimizers \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. 
optimizers which use momentum\u001b[0m\u001b[32m)\u001b[0m\u001b[32m are the default in modern deep learning due to their stable convergence properties.\\nHowever, maintaining a state of gradient statistics comes at the cost of additional memory usage. An immediate alternative might be to\\nturn to stateless optimizers such as `stochastic gradient descent `_\\nwithout momentum, which don\\'t require any additional memory usage, but will likely result in worse convergence during training.\\n\\nCan we find a middle ground here? Let\\'s consider a technique which enables the use of \"stateful\" optimizers such as `AdamW `_\\nwithout the memory overhead of gradient statistics, and without sacrificing their desirable convergence properties.\\nHow is this possible, you might ask? By *completely removing the buffer of gradients* which are stored by the optimizer during its ``step\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m``.\\n\\nTo understand how this works, we encourage you to read through the relevant PyTorch tutorial on this concept:\\n`How to save memory by fusing the optimizer step into the backward pass `_.\\n\\n\\n*Sounds great! How do I use it?*\\n\\n.. todo ref full finetune recipe doc\\n\\nIn torchtune, you can enable this feature using the ``optimizer_in_bwd`` flag. This feature works best when using a stateful optimizer\\nwith a model with a lot of parameters, and when you don\\'t need to use :ref:`gradient accumulation `.\\nYou won\\'t see meaningful impact when finetuning LoRA recipes, since in this case the number of parameters being updated are small.\\n\\n.. _glossary_cpu_offload:\\n\\nOffloading Optimizer/Gradient states to CPU\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\nWe\\'ve mentioned above the concept of optimizer states - memory used by the stateful optimizers to maintain a state of gradient statistics, and\\nmodel gradients - tensors used to store gradients when we perform model backwards passes. We support using CPU offloading in our single-device recipes\\nthrough the `CPUOffloadOptimizer `_ from ``torchao``.\\n\\nThis optimizer can wrap any base optimizer and works by keeping the optimizer states and performing the optimizer step on CPU, thus reducing\\nGPU memory usage by the size of the optimizer states. Additionally, we can also offload gradients to the CPU by using `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\nIf finetuning on a single-device, another option is to use the ``PagedAdamW8bit`` from bitsandbytes, mentioned :ref:`above `, which will *only* offload to CPU\\nwhen there is not enough GPU available.\\n\\n*Sounds great! How do I use it?*\\n\\nTo use this optimizer in your recipes, set the ``optimizer`` key in your config to :class:`torchao.prototype.low_bit_optim.CPUOffloadOptimizer`, which\\nwill use the :class:`torch.optim.AdamW` optimizer with ``\u001b[0m\u001b[32mfused\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. 
code-block:: bash\\n\\n tune run --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n optimizer.\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4e\u001b[0m\u001b[32m-5\\n\\n\\nor by directly :ref:`modifying a config file`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n offload_gradients: True\\n # additional key-word arguments can be passed to torch.optim.AdamW\\n lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, # your model here\\n Adam,\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1e\u001b[0m\u001b[32m-5,\\n \u001b[0m\u001b[32mfused\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page `_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m give GPU more work per optimizer step to amortize the offloading time \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. larger batch size with activation checkpointing, gradient accumulation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n* Gradient accumulation should always be set to 1 when ``\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``\u001b[0m\u001b[32mfsdp_cpu_offload\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 `_ to see how they differ.\\n\\n\\n.. _glossary_peft:\\n\\nParameter Efficient Fine-Tuning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPEFT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n--------------------------------------\\n\\n.. _glossary_lora:\\n\\nLow Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n*What\\'s going on here?*\\n\\nYou can read our tutorial on :ref:`finetuning Llama2 with LoRA` to understand how LoRA works, and how to use it.\\nSimply stated, LoRA greatly reduces the number of trainable parameters, thus saving significant gradient and optimizer\\nmemory during training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using any of our recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device`. 
These recipes utilize\\nLoRA-enabled model builders, which we support for all our models, and also use the ``lora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3` model has a corresponding :func:`torchtune.models.llama3.lora_llama3`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List\u001b[0m\u001b[32m[\u001b[0m\u001b[32mstr\u001b[0m\u001b[32m]\u001b[0m\u001b[32m`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. in language models\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings \u001b[0m\u001b[32m(\u001b[0m\u001b[32msuch as Gemma and Qwen2 1.5B and 0.5B\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of the LoRA decomposition, where ``lora_rank << in_dim`` and ``lora_rank << out_dim``\\n \\\\- the dimensions of an arbitrary linear layer in the model. Concretely, ``lora_rank`` reduces the number of gradients stored\\n in a linear fashion from ``in_dim * out_dim`` to ``lora_rank * \u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim + out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m``. Typically, we have ``lora_rank in \u001b[0m\u001b[32m[\u001b[0m\u001b[32m8, 256\u001b[0m\u001b[32m]\u001b[0m\u001b[32m``.\\n* ``lora_alpha: float`` affects the magnitude of the LoRA updates. 
A larger alpha results in larger updates to the base model weights\\n , potentially at the cost of training stability, conversely, smaller alpha can stabilize training at the cost of slower learning.\\n We provide default settings for these parameters which we\\'ve tested with all of our models, but we encourage you to adjust them\\n to your specific use case. Typically, one jointly changes ``lora_rank`` and ``lora_alpha`` together, where ``lora_alpha ~= 2*lora_rank``.\\n* ``lora_dropout`` introduces dropout in the LoRA layers to help regularize training. We default to 0.0 for all of our models.\\n\\nAs above, these parameters are also specified under the ``model`` flag or config entry:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m64\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 32\\n lora_alpha: 64\\n\\n.. note::\\n\\n To get a deeper sense of how LoRA parameters affect memory usage during training,\\n see the :ref:`relevant section in our Llama2 LoRA tutorial`.\\n\\n.. _glossary_qlora:\\n\\nQuantized Low Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQLoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`QLoRA `_ is a memory enhancement on top of `LoRA `_\\nthat maintains the frozen model parameters from LoRA in 4-bit quantized precision, thereby reducing memory usage.\\nThis is enabled through a novel 4-bit NormalFloat \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNF4\u001b[0m\u001b[32m)\u001b[0m\u001b[32m data type proposed by the authors, which allows for 4-8x less\\nparameter memory usage whilst retaining model accuracy. You can read our tutorial on :ref:`finetuning Llama2 with QLoRA`\\nfor a deeper understanding of how it works.\\n\\nWhen considering using QLoRA to reduce memory usage, it\\'s worth noting that QLoRA is slower than LoRA and may not be worth it if\\nthe model you are finetuning is small. In numbers, QLoRA saves roughly 1.5 bytes * \u001b[0m\u001b[32m(\u001b[0m\u001b[32m# of model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. Also, although QLoRA quantizes the model,\\nit minimizes accuracy degradation by up-casting quantized parameters to the original higher precision datatype during model forward passes - this up-casting may incur penalties to training speed.\\nThe :ref:`relevant section ` in our QLoRA tutorial demonstrates the usage of ``torch.compile`` to address this by speeding up training.\\n\\n*Sounds great! How do I use it?*\\n\\nYou can finetune using QLoRA with any of our LoRA recipes, i.e. recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device`. 
These recipes utilize\\nQLoRA-enabled model builders, which we support for all our models, and also use the ``qlora_`` prefix, e.g.\\nthe :func:`torchtune.models.llama3.llama3_8b` model has a corresponding :func:`torchtune.models.llama3.qlora_llama3_8b`.\\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with QLoRA quickly,\\njust specify any config with ``_qlora`` in its name.\\n\\nAll the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA `\\nto see how to configure these parameters.\\n\\nTo configure from the command line:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m64\u001b[0m\u001b[32m\\n\\n\\nor, by modifying a config:\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.qlora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 32\\n lora_alpha: 64\\n\\n.. _glossary_dora:\\n\\nWeight-Decomposed Low-Rank Adaptation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDoRA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n*What\\'s going on here?*\\n\\n`DoRA `_ is another PEFT technique which builds on-top of LoRA by\\nfurther decomposing the pre-trained weights into two components: magnitude and direction. The magnitude component\\nis a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA decomposition and\\nupdates the orientation of weights.\\n\\nDoRA adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to\\nimprove the performance of LoRA, particularly at low ranks.\\n\\n*Sounds great! How do I use it?*\\n\\nMuch like LoRA and QLoRA, you can finetune using DoRA with any of our LoRA recipes. We use the same model builders for LoRA\\nas we do for DoRA, so you can use the ``lora_`` version of any model builder with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``. For example, to finetune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``\u001b[0m\u001b[32mquantize\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` to reap\\neven more memory savings!\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m16\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mquantize_base\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFSDP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n.. _chat_tutorial_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m=================================\\nFine-Tuning Llama3 with Chat Data\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m=================================\\n\\nLlama3 Instruct introduced a new prompt template for fine-tuning with chat data. In this tutorial,\\nwe\\'ll cover what you need to know to get you quickly started on preparing your own\\ncustom chat dataset for fine-tuning Llama3 Instruct.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn:\\n\\n * How the Llama3 Instruct format differs from Llama2\\n * All about prompt templates and special tokens\\n * How to use your own chat dataset to fine-tune Llama3 Instruct\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`configuring datasets`\\n * Know how to :ref:`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you\\'ll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mINST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m/INST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `_\\nthe template from Llama2 to better support multiturn conversations. The same text\\nin the Llama3 Instruct format would look like this:\\n\\n.. code-block:: text\\n\\n <|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n Hello there! Nice to meet you! I\\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"system\",\\n \"content\": \"You are a helpful, respectful, and honest assistant.\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"user\",\\n \"content\": \"Who are the most influential hip-hop artists of all time?\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"role\": \"assistant\",\\n \"content\": \"Here is a list of some of the most influential hip-hop \"\\n \"artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\",\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nNow, let\\'s format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and\\nsee how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**,\\nwhich simply structures a prompt with flavor text to indicate a certain task.\\n\\n.. 
code-block:: python\\n\\n from torchtune.data import Llama2ChatTemplate, Message\\n\\n messages = \u001b[0m\u001b[32m[\u001b[0m\u001b[32mMessage.from_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmsg\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for msg in sample\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n formatted_messages = Llama2ChatTemplate.format\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmessages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mformatted_messages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n # Message\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32mrole\u001b[0m\u001b[32m=\\'user\\',\\n # \u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m=\\'\u001b[0m\u001b[32m[\u001b[0m\u001b[32mINST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m <>\\\\nYou are a helpful, respectful, and honest assistant.\\\\n<>\\\\n\\\\nWho are the most influential hip-hop artists of all time? \u001b[0m\u001b[32m[\u001b[0m\u001b[32m/INST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\',\\n # ...,\\n # \u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\n # Message\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32mrole\u001b[0m\u001b[32m=\\'assistant\\',\\n # \u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m=\\'Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.\\',\\n # ...,\\n # \u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\n # \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThere are also special tokens used by Llama2, which are not in the prompt template.\\nIf you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you\\'ll notice that\\nwe don\\'t include the :code:`` and :code:`` tokens. These are the beginning-of-sequence\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mBOS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and end-of-sequence \u001b[0m\u001b[32m(\u001b[0m\u001b[32mEOS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m tokens that are represented differently in the tokenizer\\nthan the rest of the prompt template. Let\\'s tokenize this example with the\\n:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see\\nwhy.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_tokenizer\\n\\n tokenizer = llama2_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Llama-2-7b-hf/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n user_message = formatted_messages\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.text_content\\n tokens = tokenizer.encode\u001b[0m\u001b[32m(\u001b[0m\u001b[32muser_message, \u001b[0m\u001b[32madd_bos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m, \u001b[0m\u001b[32madd_eos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokens\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32m[\u001b[0m\u001b[32m1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nWe\\'ve added the BOS and EOS tokens when encoding our example text. This shows up\\nas IDs 1 and 2. We can verify that these are our BOS and EOS tokens.\\n\\n.. 
code-block:: python\\n\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer._spm_model.spm_model.piece_to_id\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 1\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer._spm_model.spm_model.piece_to_id\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 2\\n\\nThe BOS and EOS tokens are what we call special tokens, because they have their own\\nreserved token IDs. This means that they will index to their own individual vectors in\\nthe model\\'s learnt embedding table. The rest of the prompt template tags, :code:`\u001b[0m\u001b[32m[\u001b[0m\u001b[32mINST\u001b[0m\u001b[32m]\u001b[0m\u001b[32m`\\nand :code:`<>` are tokenized as normal text and not their own IDs.\\n\\n.. code-block:: python\\n\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32m518\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\'\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32m25580\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'INST\\'\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32m29962\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\'\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3532, 14816, 29903, 6778\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'<>\\'\\n\\nIt\\'s important to note that you should not place the special reserved tokens in your\\ninput prompts manually, as it will be treated as normal text and not as a special\\ntoken.\\n\\n.. code-block:: python\\n\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.encode\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"\", \u001b[0m\u001b[32madd_bos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m, \u001b[0m\u001b[32madd_eos\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \u001b[0m\u001b[32m[\u001b[0m\u001b[32m529, 29879, 29958\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nNow let\\'s take a look at Llama3\\'s formatting to see how it\\'s tokenized differently\\nthan Llama2.\\n\\n.. 
code-block:: python\\n\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n messages = \u001b[0m\u001b[32m[\u001b[0m\u001b[32mMessage.from_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmsg\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for msg in sample\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n tokens, mask = tokenizer.tokenize_messages\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmessages\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.decode\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokens\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # \\'<|start_header_id|>system<|end_header_id|>\\\\n\\\\nYou are a helpful, respectful,\\n # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\\\n\\\\nWho\\n # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|>\\n # assistant<|end_header_id|>\\\\n\\\\nHere is a list of some of the most influential hip-hop\\n # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>\\'\\n\\n.. note::\\n We used the ``tokenize_messages`` API for Llama3, which is different than\\n encode. It simply manages adding all the special tokens in the correct\\n places after encoding the individual messages.\\n\\nWe can see that the tokenizer handled all the formatting without us specifying a prompt\\ntemplate. It turns out that all of the additional tags are special tokens, and we don\\'t require\\na separate prompt template. We can verify this by checking if the tags get encoded\\nas their own token IDs.\\n\\n.. code-block:: python\\n\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.special_tokens\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"<|begin_of_text|>\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 128000\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtokenizer.special_tokens\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"<|eot_id|>\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # 128009\\n\\nThe best part is - all these special tokens are handled purely by the tokenizer.\\nThat means you won\\'t have to worry about messing up any required prompt templates!\\n\\n\\nWhen should I use a prompt template?\\n------------------------------------\\n\\nWhether or not to use a prompt template is governed by what your desired inference\\nbehavior is. You should use a prompt template if you are running inference on the\\nbase model and it was pre-trained with a prompt template, or you want to prime a\\nfine-tuned model to expect a certain prompt structure on inference for a specific task.\\n\\nIt is not strictly necessary to fine-tune with a prompt template, but generally\\nspecific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate`\\nprovides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text.\\nThis would wrap around the user message, with the assistant message untouched.\\n\\n.. code-block:: python\\n\\n f\"Summarize this dialogue:\\\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mdialogue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\\\n---\\\\nSummary:\\\\n\"\\n\\nYou can fine-tune Llama2 with this template even though the model was originally pre-trained\\nwith the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model\\nsees during inference. 
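As a rough illustration of what such a prompt template does, the sketch below wraps only the user turn in the summarization flavor text and leaves the assistant turn untouched. It is a standalone toy example, not the actual :class:`~torchtune.data.SummarizeTemplate` implementation, and the sample dialogue is made up.

.. code-block:: python

    # Toy sketch of a summarization-style prompt template (illustration only).
    def wrap_user_message(role: str, content: str) -> str:
        # Only user messages get the extra "flavor text"; assistant messages are untouched.
        if role == "user":
            return f"Summarize this dialogue:\n{content}\n---\nSummary:\n"
        return content

    sample = [
        {"role": "user", "content": "A: Are we still on for lunch? B: Yes, see you at noon."},
        {"role": "assistant", "content": "A and B confirm they will meet for lunch at noon."},
    ]

    for msg in sample:
        print(wrap_user_message(msg["role"], msg["content"]))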
The model should be robust enough to adapt to a new template.\\n\\n\\nFine-tuning on a custom chat dataset\\n------------------------------------\\n\\nLet\\'s test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom\\nchat dataset. We\\'ll walk through how to set up our data so that it can be tokenized\\ncorrectly and fed into our model.\\n\\nLet\\'s say we have a local dataset saved as a JSON file that contains conversations\\nwith an AI model. How can we get something like this into a format\\nLlama3 understands and tokenizes correctly?\\n\\n.. code-block:: python\\n\\n # data/my_data.json\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"dialogue\": \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"human\",\\n \"value\": \"What is your name?\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"gpt\",\\n \"value\": \"I am an AI assistant, I don\\'t have a name.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"human\",\\n \"value\": \"Pretend you have a name.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n \"from\": \"gpt\",\\n \"value\": \"My name is Mark Zuckerberg.\"\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m}\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nLet\\'s first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we\\nhave conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n ds = chat_dataset\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32msource\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"json\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mdata_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"data\u001b[0m\u001b[32m/my_data.json\",\\n \u001b[0m\u001b[32msplit\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"train\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_column\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"dialogue\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_style\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"sharegpt\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default \u001b[0m\u001b[32m(\u001b[0m\u001b[32m:class:`~torchtune.models.mistral.MistralChatTemplate`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to format\\nall messages according to their `recommendations `_.\\n\\nNow we\\'re ready to start fine-tuning! We\\'ll use the built-in LoRA single device recipe.\\nUse the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml`\\nconfig and update it with your dataset configuration.\\n\\nLaunch the fine-tune!\\n\\n.. code-block:: bash\\n\\n $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml \u001b[0m\u001b[32mepochs\u001b[0m\u001b[32m=\u001b[0m\u001b[32m15\u001b[0m\u001b[32m\\n\\n.. _llama3_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m========================\\nMeta Llama3 in torchtune\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m========================\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to:\\n\\n * Download the Llama3-8B-Instruct weights and tokenizer\\n * Fine-tune Llama3-8B-Instruct with LoRA and QLoRA\\n * Evaluate your fine-tuned Llama3-8B-Instruct model\\n * Generate text with your fine-tuned model\\n * Quantize your model to speed up generation\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n\\n\\nLlama3-8B\\n---------\\n\\n`Meta Llama 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size \u001b[0m\u001b[32m(\u001b[0m\u001b[32m128,256 instead of 32,000 from Llama2 models\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n- Llama3-8B uses a different tokenizer than Llama2 models \u001b[0m\u001b[32m(\u001b[0m\u001b[32m`tiktoken `_ instead of `sentencepiece `_\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. 
First, let\\'s download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3-8B-Instruct \\\\\\n --output-dir \\\\\\n --hf-token \\n\\n|\\n\\nFine-tuning Llama3-8B-Instruct in torchtune\\n-------------------------------------------\\n\\ntorchtune provides `LoRA `_, `QLoRA `_, and full fine-tuning\\nrecipes for fine-tuning Llama3-8B on one or more GPUs. For more on LoRA in torchtune, see our :ref:`LoRA Tutorial `.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.\u001b[0m\u001b[32mcheckpoint_dir\u001b[0m\u001b[32m= \\\\\\n tokenizer.\u001b[0m\u001b[32mpath\u001b[0m\u001b[32m=/tokenizer.model \\\\\\n checkpointer.\u001b[0m\u001b[32moutput_dir\u001b[0m\u001b[32m=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand other\u001b[0m\u001b[32m)\u001b[0m\u001b[32m configs we can use :ref:`tune cp ` to copy \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand modify\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmuch smaller\u001b[0m\u001b[32m)\u001b[0m\u001b[32m LoRA weights\\nwill be saved separately.\\n\\nIn our experiments, we observed a peak memory usage of 18.5 GB. The default config can be trained on a consumer GPU with 24 GB VRAM.\\n\\nIf you have multiple GPUs available, you can run the distributed version of the recipe.\\ntorchtune makes use of the `FSDP `_ APIs from PyTorch Distributed\\nto shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster overall training.\\nFor example, on two devices:\\n\\n.. 
code-block:: bash\\n\\n tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora\\n\\nFinally, if we want to use even less memory, we can leverage torchtune\\'s QLoRA recipe via:\\n\\n.. TODO \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSalmanMohammadi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ref qlora recipe page\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_qlora_single_device\\n\\nSince our default configs enable full bfloat16 training, all of the above commands can be run with\\ndevices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory\\nbelow 10 GB. You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.\\nTry it out!\\n\\n|\\n\\nEvaluating fine-tuned Llama3-8B models with EleutherAI\\'s Eval Harness\\n---------------------------------------------------------------------\\n\\nNow that we\\'ve fine-tuned our model, what\\'s next? Let\\'s take our LoRA-finetuned model from the\\npreceding section and look at a couple different ways we can evaluate its performance on the tasks we care about.\\n\\nFirst, torchtune provides an integration with\\n`EleutherAI\\'s evaluation harness `_\\nfor model evaluation on common benchmark tasks.\\n\\n.. note::\\n Make sure you\\'ve first installed the evaluation harness via :code:`pip install \"\u001b[0m\u001b[32mlm_eval\u001b[0m\u001b[32m==0.4.*\"`.\\n\\nFor this tutorial we\\'ll use the `truthfulqa_mc2 `_ task from the harness.\\nThis task measures a model\\'s propensity to be truthful when answering questions and\\nmeasures the model\\'s zero-shot accuracy on a question followed by one or more true\\nresponses and one or more false responses. First, let\\'s copy the config so we can point the YAML\\nfile to our fine-tuned checkpoint files.\\n\\n.. code-block:: bash\\n\\n tune cp eleuther_evaluation ./custom_eval_config.yaml\\n\\nNext, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n # directory with the checkpoint files\\n # this should match the output_dir specified during\\n # fine-tuning\\n checkpoint_dir: \\n\\n # checkpoint files for the fine-tuned model. These will be logged\\n # at the end of your fine-tune\\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n meta_model_0.pt\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n output_dir: \\n model_type: LLAMA3\\n\\n # Make sure to update the tokenizer path to the right\\n # checkpoint directory as well\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tokenizer.model\\n\\nFinally, we can run evaluation using our modified config.\\n\\n.. code-block:: bash\\n\\n tune run eleuther_eval --config ./custom_eval_config.yaml\\n\\nTry it for yourself and see what accuracy your model gets!\\n\\n|\\n\\nGenerating text with our fine-tuned Llama3 model\\n------------------------------------------------\\n\\n.. TODO \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSalmanMohammadi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ref generate recipe page\\n\\nNext, let\\'s look at one other way we can evaluate our model: generating text! torchtune provides a\\n`recipe for generation `_ as well.\\n\\nSimilar to what we did, let\\'s copy and modify the default generation config.\\n\\n.. 
code-block:: bash\\n\\n tune cp generation ./custom_generation_config.yaml\\n\\nNow we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer.\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n\\n # directory with the checkpoint files\\n # this should match the output_dir specified during\\n # fine-tuning\\n checkpoint_dir: \\n\\n # checkpoint files for the fine-tuned model. These will be logged\\n # at the end of your fine-tune\\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\n meta_model_0.pt\\n \u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n output_dir: \\n model_type: LLAMA3\\n\\n # Make sure to update the tokenizer path to the right\\n # checkpoint directory as well\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tokenizer.model\\n\\nRunning generation with our LoRA-finetuned model, we see the following output:\\n\\n.. code-block:: bash\\n\\n tune run generate --config ./custom_generation_config.yaml \\\\\\n prompt.\u001b[0m\u001b[32muser\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"Hello\u001b[0m\u001b[32m, my name is\"\\n\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:122\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.\\n ...\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:135\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Time for inference: 10.88 sec total, 18.94 tokens/sec\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:138\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Bandwidth achieved: 346.09 GB/s\\n \u001b[0m\u001b[32m[\u001b[0m\u001b[32mgenerate.py:139\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Memory used: 18.31 GB\\n\\nFaster generation via quantization\\n----------------------------------\\n\\nWe rely on `torchao `_ for `post-training quantization `_.\\nTo quantize the fine-tuned model after installing torchao we can run the following command::\\n\\n # we also support `int8_weight_only\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and `int8_dynamic_activation_int8_weight\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`, see\\n # https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques\\n # for a full list of techniques that we support\\n from torchao.quantization.quant_api import quantize_, int4_weight_only\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmodel, int4_weight_only\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nAfter quantization, we rely on torch.compile for speedups. For more details, please see `this example usage `_.\\n\\ntorchao also provides `this table `_ listing performance and accuracy results for ``llama2`` and ``llama3``.\\n\\nFor Llama models, you can run generation directly in torchao on the quantized model using their ``generate.py`` script as\\ndiscussed in `this readme `_. This way you can compare your own results\\nto those in the previously-linked table.\\n\\n\\nThis is just the beginning of what you can do with Meta Llama3 using torchtune and the broader ecosystem.\\nWe look forward to seeing what you build!\\n\\n404: Not Found\\n.. 
_qat_finetune_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m===========================\\nFine-Tuning Llama3 with QAT\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m===========================\\n\\nQuantization-Aware Training \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQAT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is a common technique for users to quantize their\\nmodels without incurring significant degradations in accuracy or perplexity. In this\\ntutorial, we’ll walk through how to apply QAT during fine-tuning, quantize the\\nresulting model, and evaluate your quantized model using torchtune.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What QAT is and how it helps reduce quantization degradation\\n * How to run QAT during fine-tuning in torchtune\\n * End-to-end example of connecting QAT, quantization, and evaluation recipes\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama3-8B model weights`\\n\\n.. _what_is_qat_label:\\n\\nWhat is QAT?\\n------------\\n\\n`Quantization-Aware Training `_ \u001b[0m\u001b[32m(\u001b[0m\u001b[32mQAT\u001b[0m\u001b[32m)\u001b[0m\u001b[32m refers to simulating quantization numerics during\\ntraining or fine-tuning, with the end goal of ultimately producing a higher quality\\nquantized model compared to simple post-training quantization \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPTQ\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. During QAT,\\nthe weights and/or activations are “fake quantized”, meaning they are transformed\\nas if they were being quantized, but kept in the original data type \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. bfloat16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nwithout being actually cast to lower bit-widths. Thus, fake quantization allows the\\nmodel to adjust for quantization noise when updating the weights, hence the training\\nprocess is “aware” that the model will ultimately be quantized after training.\\n\\n.. 
code-block:: python\\n\\n # PTQ: x_q is quantized and cast to int8\\n # scale and zero point \u001b[0m\u001b[32m(\u001b[0m\u001b[32mzp\u001b[0m\u001b[32m)\u001b[0m\u001b[32m refer to parameters used to quantize x_float\\n # qmin and qmax refer to the range of quantized values\\n x_q = \u001b[0m\u001b[32m(\u001b[0m\u001b[32mx_float / scale + zp\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.round\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.clamp\u001b[0m\u001b[32m(\u001b[0m\u001b[32mqmin, qmax\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.cast\u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # QAT: x_fq is still in float\\n # Fake quantize simulates the numerics of quantize + dequantize\\n x_fq = \u001b[0m\u001b[32m(\u001b[0m\u001b[32mx_float / scale + zp\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.round\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.clamp\u001b[0m\u001b[32m(\u001b[0m\u001b[32mqmin, qmax\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n x_fq = \u001b[0m\u001b[32m(\u001b[0m\u001b[32mx_fq - zp\u001b[0m\u001b[32m)\u001b[0m\u001b[32m * scale\\n\\nQAT typically involves applying a transformation to your model before and after training.\\nFor example, in the `torchao QAT implementation `_,\\nthese are represented as the ``prepare\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`` and ``convert\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`` steps: \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ``prepare\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`` inserts fake quantize\\noperations into linear layers, and \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m ``convert\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`` transforms the fake quantize operations\\nto actual quantize and dequantize operations after training, thereby producing a quantized\\nmodel \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdequantize operations are typically fused with linear after lowering\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\nBetween these two steps, training can proceed exactly as before.\\n\\n.. image:: /_static/img/qat_diagram.png\\n\\n.. _apply_qat_label:\\n\\nApplying QAT to Llama3 models\\n-----------------------------\\n\\nWe can easily apply the above QAT transformations to Llama3 for fine-tuning,\\nleveraging the APIs in torchao as follows:\\n\\n.. 
code-block:: python\\n\\n import copy\\n import torch\\n from torchao.quantization import quantize_\\n from torchao.quantization.qat import \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n FakeQuantizeConfig,\\n IntXQuantizationAwareTrainingConfig,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n from torchtune.models.llama3 import llama3_8b\\n\\n model = llama3_8b\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n original_model = copy.deepcopy\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmodel\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Config for int8 dynamic asymmetric per token activations +\\n # int4 symmetric per group weights, only for linear layers\\n activation_config = FakeQuantizeConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtorch.int8, \"per_token\", \u001b[0m\u001b[32mis_symmetric\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n weight_config = FakeQuantizeConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32mtorch.int4, \u001b[0m\u001b[32mgroup_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n qat_config = IntXQuantizationAwareTrainingConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation_config, weight_config\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Prepare the model for quantization-aware fine-tuning.\\n #\\n # This step inserts \"fake quantize\" ops that simulate\\n # quantization numerics during fine-tuning without\\n # actually casting the activations/weights to lower-bit\\n # dtypes like in \"real\" quantization.\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mmodel, qat_config\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n prepared_model = model\\n\\nThe model is now ready for QAT fine-tuning! If we print the model we’ll see that\\nall linear layers have been swapped with :code:`FakeQuantizedLinear`, which simulates\\nthe numerics of int8 dynamic asymmetric per token activations + int4 symmetric\\nper group weights:\\n\\n.. 
code-block:: bash

    >>> original_model.layers[0].attn
    MultiHeadAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (pos_embeddings): RotaryPositionalEmbeddings()
    )

.. code-block:: bash

    >>> prepared_model.layers[0].attn
    MultiHeadAttention(
      (q_proj): FakeQuantizedLinear(
        in_features=4096, out_features=4096, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
      )
      (k_proj): FakeQuantizedLinear(
        in_features=4096, out_features=1024, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
      )
      (v_proj): FakeQuantizedLinear(
        in_features=4096, out_features=1024, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
      )
      (output_proj): FakeQuantizedLinear(
        in_features=4096, out_features=4096, bias=False
        (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
        (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=..., scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=..., is_dynamic=True, range_learning=False))
      )
      (pos_embeddings): RotaryPositionalEmbeddings()
\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nAfter fine-tuning, we can convert the model to get an actual quantized model:\\n\\n.. code-block:: python\\n\\n from torchao.quantization.qat import \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n FromIntXQuantizationAwareTrainingConfig,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n from torchao.quantization import \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n Int8DynamicActivationInt4WeightConfig,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Fine-tune as before\\n train_loop\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Convert the fake quantized model into an actual quantized model\\n #\\n # First, we swap `FakeQuantizedLinear` back to `torch.nn.Linear`\\n # while keeping the QAT fine-tuned weights. Then, we perform standard\\n # post-training quantization \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPTQ\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, which inserts quantized activation\\n # and weight tensor subclasses\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model, FromIntXQuantizationAwareTrainingConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n quantize_\u001b[0m\u001b[32m(\u001b[0m\u001b[32mprepared_model, Int8DynamicActivationInt4WeightConfig\u001b[0m\u001b[32m(\u001b[0m\u001b[32mgroup_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n converted_model = prepared_model\\n\\nThe model is now fully quantized to int8 and int4 and ready for inference\\nor generation. If we print the model now, we will see the linear layers\\nare now swapped back to :code:`torch.nn.Linear`, but with quantized tensor\\nactivations and weights:\\n\\n.. code-block:: bash\\n\\n >>> converted_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4096, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: 
Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1024\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1024, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1024\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1024, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mLinearActivationQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mactivation\u001b[0m\u001b[32m=, 
\u001b[0m\u001b[32mweight\u001b[0m\u001b[32m=\u001b[0m\u001b[32mAffineQuantizedTensor\u001b[0m\u001b[32m(\u001b[0m\u001b[32mshape\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.Size\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4096, 4096\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mblock_size\u001b[0m\u001b[32m=\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1, 32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mdevice\u001b[0m\u001b[32m=\u001b[0m\u001b[32mcpu\u001b[0m\u001b[32m, \u001b[0m\u001b[32m_layout\u001b[0m\u001b[32m=\u001b[0m\u001b[32mPlainLayout\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mtensor_impl_dtype\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtorch\u001b[0m\u001b[32m.int8, \u001b[0m\u001b[32mquant_min\u001b[0m\u001b[32m=-8, \u001b[0m\u001b[32mquant_max\u001b[0m\u001b[32m=\u001b[0m\u001b[32m7\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n\\nQAT finetuning recipe in torchtune\\n----------------------------------\\n\\nPutting it all together, we can now fine-tune a model using torchtune’s :ref:`QAT recipe`.\\nMake sure that you have first downloaded the Llama3 weights and tokenizer by\\nfollowing :ref:`these instructions`. In this tutorial,\\nwe use the following settings to demonstrate QAT’s effectiveness in recovering\\nquantization degradation compared to directly quantizing a model fine-tuned\\nwithout QAT. You can copy the default QAT config and make the following\\nmodifications accordingly:\\n\\n.. code-block:: bash\\n\\n tune cp llama3/8B_qat_full custom_8B_qat_full.yaml\\n\\n.. code-block:: yaml\\n\\n dataset:\\n _component_: torchtune.datasets.text_completion_dataset\\n source: allenai/c4\\n column: text\\n name: en\\n split: train\\n\\n ...\\n\\n epochs: 1\\n max_steps_per_epoch: 2000\\n fake_quant_after_n_steps: 1000\\n\\nBy default, this uses the :code:`torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer`,\\nwhich uses the same fake quantization configurations as the example above.\\n\\nEmpirically, we observed that disabling fake quantization for the first N steps\\nled to better results, presumably because doing so allows the weights to stabilize\\nbefore we start introducing quantization noise to the fine-tuning process.\\nFor this reason, here we disable fake quantization for the first 1000 steps.\\n\\nYou can then use the following command to run fine-tuning with QAT using the above\\nconfig. This workload requires at least 6 GPUs, each with VRAM of at least 80GB.\\nBy default, this uses the int8 dynamic per token activations + int4 grouped per\\nchannel weights quantization configuration as shown above:\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 6 qat_distributed --config custom_8B_qat_full.yaml\\n\\n.. note::\\n\\n Make sure to point to the location of your Llama3 weights and tokenizer. 
This can be done\\n either by adding :code:`checkpointer.\u001b[0m\u001b[32mcheckpoint_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32mmy_model_checkpoint_path\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32mtokenizer_checkpoint\u001b[0m\u001b[32m=\u001b[0m\u001b[32mmy_tokenizer_checkpoint_path\u001b[0m\u001b[32m`\\n or by directly modifying the :code:`8B_qat_full.yaml` file. See our :ref:`config_tutorial_label`\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n\\n QAT introduces memory and computation overheads compared to regular fine-tuning,\\n since fake quantization fundamentally involves extra ops and requires cloning\\n the weights to avoid mutating them when computing the fake quantized values.\\n In general, we expect around 30% decrease in fine-tuning speed for models like\\n Llama3-8B. With activation checkpointing, the increase in memory footprint per\\n GPU is minimal \u001b[0m\u001b[32m(\u001b[0m\u001b[32m< 5GB per GPU\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n\\nQuantizing the QAT model\\n------------------------\\n\\nNote that the QAT recipe above produces an unquantized bfloat16 model. The model\\nstructure is exactly the same as the model produced with regular full fine-tuning\\nwithout QAT, just with different weights. To actually get a quantized model,\\ncopy and make the following modifications to the quantization config:\\n\\n.. code-block:: bash\\n\\n tune cp quantization custom_quantization.yaml\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelMetaCheckpointer\\n checkpoint_dir: \\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32mft-model-00001-of-00001.bin\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n output_dir: \\n model_type: LLAMA3\\n\\n ...\\n\\n quantizer:\\n _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer\\n groupsize: 256\\n\\nThe following command performs the convert step in the QAT flow, which actually\\nquantizes the float model to a model with quantized weights:\\n\\n.. code-block:: bash\\n\\n tune run quantize --config custom_quantization.yaml\\n\\n.. note::\\n\\n Make sure to use the same QAT quantizer you used to fine-tune your model,\\n otherwise the numerics will be off and the quantized model will perform poorly.\\n\\n.. _qat_eval_label:\\n\\nEvaluating the quantized model\\n------------------------------\\n\\nNow that we have a quantized model, we can run some evaluations on it and compare the\\nresults against regular fine-tuning without QAT \u001b[0m\u001b[32m(\u001b[0m\u001b[32mi.e. post-training quantization\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\nTo achieve this, we use `EleutherAI’s evaluation harness `_\\nintegrated in torchtune. First, copy the evaluation config and make the following changes:\\n\\n.. code-block:: bash\\n\\n tune cp eleuther_evaluation custom_eleuther_evaluation.yaml\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.llama3_8b\\n\\n checkpointer:\\n _component_: torchtune.training.FullModelTorchTuneCheckpointer\\n checkpoint_dir: \\n checkpoint_files: \u001b[0m\u001b[32m[\u001b[0m\u001b[32mft-model-00001-of-00001-8da4w.bin\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n output_dir: \\n model_type: LLAMA3\\n\\n ...\\n\\n tasks: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"hellaswag\", \"wikitext\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n quantizer:\\n _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer\\n groupsize: 256\\n\\n.. note::\\n\\n Since we are passing in a quantized model, be sure to use the corresponding\\n post-training quantizer instead of the QAT quantizer. For example, if you\\n used the :code:`Int8DynActInt4WeightQATQuantizer` during fine-tuning, you\\n should specify :code:`Int8DynActInt4WeightQuantizer` in this step. See the\\n `quantization recipe `_\\n for a full list of supported quantizers.\\n\\nNow run the evaluation recipe:\\n\\n.. code-block:: bash\\n\\n tune run eleuther_eval --config my_eleuther_evaluation.yaml\\n\\nThe results should look something like this:\\n\\n.. code-block:: bash\\n\\n # QAT quantized model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 activations + int4 weights\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|\\n |---------|------:|------|-----:|---------------|-----:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|9.9148|± |N/A |\\n | | |none | 0|byte_perplexity|1.5357|± |N/A |\\n | | |none | 0|bits_per_byte |0.6189|± |N/A |\\n |hellaswag| 1|none | 0|acc |0.5687|± |0.0049|\\n | | |none | 0|acc_norm |0.7536|± |0.0043|\\n\\nComparing these results to the model fine-tuned without QAT, we can see that\\nQAT was able to recover a significant portion of the quantization degradations\\nfrom the original unquantized model compared to PTQ. For example, normalized\\naccuracy in the hellaswag task dropped by 2.20% with PTQ but only 0.74% with\\nQAT when compared to the original unquantized model. Similarly, word perplexity\\nin the wikitext task increased by 2.048 with PTQ but only 1.190 with QAT \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlower\\nis better\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n.. code-block:: bash\\n\\n # PTQ quantized model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 activations + int4 weights\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric | Value | |Stderr|\\n |---------|------:|------|-----:|---------------|------:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|10.7735|± |N/A |\\n | | |none | 0|byte_perplexity| 1.5598|± |N/A |\\n | | |none | 0|bits_per_byte | 0.6413|± |N/A |\\n |hellaswag| 1|none | 0|acc | 0.5481|± |0.0050|\\n | | |none | 0|acc_norm | 0.7390|± |0.0044|\\n\\n.. code-block:: bash\\n\\n # Float model evaluation results \u001b[0m\u001b[32m(\u001b[0m\u001b[32mbfloat16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|\\n |---------|------:|------|-----:|---------------|-----:|---|------|\\n |wikitext | 2|none | 0|word_perplexity|8.7251|± |N/A |\\n | | |none | 0|byte_perplexity|1.4994|± |N/A |\\n | | |none | 0|bits_per_byte |0.5844|± |N/A |\\n |hellaswag| 1|none | 0|acc |0.5740|± |0.0049|\\n | | |none | 0|acc_norm |0.7610|± |0.0043|\\n\\nThus, the QAT flow produced a quantized model that outperforms the post-training\\nquantized model. 
Importantly, the quantized model structure is identical in both\\nflows, and so the model size, memory usage, and all other performance\\ncharacteristics are also the same.\\n\\nNote that although the weights are quantized to int4, the quantized model size\\nfor both the QAT and the PTQ flows are 8.187 GB, while the original float model\\nis 14.958 GB. This is because this quantizer uses int8 to represent the weights\\nas PyTorch does not have native int4 dtype support. A more efficient representation\\nis to pack the int4 weights, which will halve the quantized model size. This is\\nwhat the Int4WeightOnlyQuantizer does, and the corresponding QAT quantizer will\\nbe added in the future.\\n\\nLowering QAT model to device \u001b[0m\u001b[32m(\u001b[0m\u001b[32moptional\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n---------------------------------------\\n\\nOne important motivation for quantizing a model is to be able to run it in resource\\nconstrained environments. You can further lower your QAT Llama3 model to edge devices\\nsuch as smartphones using `executorch `_ by\\nfollowing `these instructions `_.\\nFor example, the following command lowers the model to the XNNPACK backend:\\n\\n.. code-block:: bash\\n\\n python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 256 -d fp32 --metadata \\'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"get_bos_id\":128000, \"get_eos_id\":128001\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\' --embedding-quantize 4,32 --\u001b[0m\u001b[32moutput_name\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"llama3_8da4w\u001b[0m\u001b[32m.pte\"\\n\\nThis results in a much smaller quantized model of size 3.881 GB. When benchmarked on a OnePlus 12 smartphone, this model also achieved the same inference and generation speeds as the post-training quantized model. This is because the model structures are the same across the two flows:\\n\\n.. list-table::\\n :widths: 25 25 25\\n :header-rows: 1\\n\\n * -\\n - QAT\\n - PTQ\\n * - Quantized model size\\n - 3.881 GB\\n - 3.881 GB\\n * - Inference speed\\n - 9.709 tok/s\\n - 9.815 tok/s\\n * - Generation speed\\n - 11.316 tok/s\\n - 11.364 tok/s\\n\\n.. _lora_finetune_label:\\n\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\nFine-Tuning Llama2 with LoRA\\\u001b[0m\u001b[32mn\u001b[0m\u001b[32m============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network\\'s remaining parameters. 
LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer\\'s self-attention.\\n\\n.. note::\\n\\n If you\\'re unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas opposed to finetuning all model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_,\\nyou can expect to see further memory savings from the optimizer state.\\n\\n.. note::\\n\\n LoRA memory savings come primarily from gradient and optimizer states,\\n so if your model\\'s peak memory comes in its :code:`forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` method, then LoRA\\n may not reduce peak memory.\\n\\nHow does LoRA work?\\n-------------------\\n\\nLoRA replaces weight update matrices with a low-rank approximation. In general, weight updates\\nfor an arbitrary :code:`nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim,out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` layer could have rank as high as\\n:code:`min\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim,out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand other related papers such as `Aghajanyan et al. `_\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nhypothesize that the `intrinsic dimension `_\\nof these updates during LLM fine-tuning can in fact be much lower.\\nTo take advantage of this property, LoRA finetuning will freeze the original model,\\nthen add a trainable weight update from a low-rank projection. More explicitly, LoRA trains two\\nmatrices :code:`A` and :code:`B`. :code:`A` projects the inputs down to a much smaller rank \u001b[0m\u001b[32m(\u001b[0m\u001b[32moften four or eight in practice\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and\\n:code:`B` projects back up to the dimension output by the original linear layer.\\n\\nThe image below gives a simplified representation of a single weight update step from a full finetune\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mon the left\u001b[0m\u001b[32m)\u001b[0m\u001b[32m compared to a weight update step with LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mon the right\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. The LoRA matrices :code:`A` and :code:`B`\\nserve as an approximation to the full rank weight update in blue.\\n\\n.. image:: /_static/img/lora_diagram.png\\n\\nAlthough LoRA introduces a few extra parameters in the model :code:`forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`, only the :code:`A` and :code:`B` matrices are trainable.\\nThis means that with a rank :code:`r` LoRA decomposition, the number of gradients we need to store reduces\\nfrom :code:`in_dim*out_dim` to :code:`r*\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim+out_dim\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. \u001b[0m\u001b[32m(\u001b[0m\u001b[32mRemember that in general :code:`r`\\nis much smaller than :code:`in_dim` and :code:`out_dim`.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nFor example, in the 7B Llama2\\'s self-attention, :code:`\u001b[0m\u001b[32min_dim\u001b[0m\u001b[32m=\u001b[0m\u001b[32mout_dim\u001b[0m\u001b[32m=4096` for the Q, K,\\nand V projections. 
This means a LoRA decomposition of rank :code:`\u001b[0m\u001b[32mr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m` will reduce the number of trainable\\nparameters for a given projection from :math:`4096 * 4096 \\\\approx 15M` to :math:`8 * 8192 \\\\approx 65K`, a\\nreduction of over 99%.\\n\\nLet\\'s take a look at a minimal implementation of LoRA in native PyTorch.\\n\\n\\n.. code-block:: python\\n\\n import torch\\n from torch import nn\\n\\n class LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mnn.Module\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n def __init__\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n self,\\n in_dim: int,\\n out_dim: int,\\n rank: int,\\n alpha: float,\\n dropout: float\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n # These are the weights from the original pretrained model\\n self.linear = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim, out_dim, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # These are the new LoRA params. In general rank << in_dim, out_dim\\n self.lora_a = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_dim, rank, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n self.lora_b = nn.Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mrank, out_dim, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Rank and alpha are commonly-tuned hyperparameters\\n self.rank = rank\\n self.alpha = alpha\\n\\n # Most implementations also include some dropout\\n self.dropout = nn.Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # The original params are frozen, and only LoRA params are trainable.\\n self.linear.weight.requires_grad = False\\n self.lora_a.weight.requires_grad = True\\n self.lora_b.weight.requires_grad = True\\n\\n def forward\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself, x: torch.Tensor\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> torch.Tensor:\\n # This would be the output of the original model\\n frozen_out = self.linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # lora_a projects inputs down to the much smaller self.rank,\\n # then lora_b projects back up to the output dimension\\n lora_out = self.lora_b\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.lora_a\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Finally, scale by the alpha parameter \u001b[0m\u001b[32m(\u001b[0m\u001b[32mnormalized by rank\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n # and add to the original model\\'s outputs\\n return frozen_out + \u001b[0m\u001b[32m(\u001b[0m\u001b[32mself.alpha / self.rank\u001b[0m\u001b[32m)\u001b[0m\u001b[32m * lora_out\\n\\nThere are some other details around initialization which we omit here, but if you\\'d like to know more\\nyou can see our implementation in :class:`~torchtune.modules.peft.LoRALinear`.\\nNow that we understand what LoRA is doing, let\\'s look at how we can apply it to our favorite models.\\n\\nApplying LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. 
code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.\\n # We can also set \u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m or \u001b[0m\u001b[32mapply_lora_to_output\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mbase_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Print the same 
for Llama2 with LoRA weights\\n >>> print\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model.layers\u001b[0m\u001b[32m[\u001b[0m\u001b[32m0\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.attn\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n MultiHeadAttention\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mq_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32m0\u001b[0m\u001b[32m.0, \u001b[0m\u001b[32minplace\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_a\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_b\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mk_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mv_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: LoRALinear\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdropout\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Dropout\u001b[0m\u001b[32m(\u001b[0m\u001b[32mp\u001b[0m\u001b[32m=\u001b[0m\u001b[32m0\u001b[0m\u001b[32m.0, \u001b[0m\u001b[32minplace\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_a\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_b\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m8\u001b[0m\u001b[32m, \u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutput_proj\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: Linear\u001b[0m\u001b[32m(\u001b[0m\u001b[32min_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, 
\u001b[0m\u001b[32mout_features\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4096\u001b[0m\u001b[32m, \u001b[0m\u001b[32mbias\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpos_embeddings\u001b[0m\u001b[32m)\u001b[0m\u001b[32m: RotaryPositionalEmbeddings\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n\\nNotice that our LoRA model\\'s layer contains additional weights in the Q and V projections,\\nas expected. Additionally, inspecting the type of :code:`lora_model` and\\n:code:`base_model`, would show that they are both instances of the same :class:`~torchtune.modules.TransformerDecoder`.\\n\u001b[0m\u001b[32m(\u001b[0m\u001b[32mFeel free to verify this for yourself.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nWhy does this matter? torchtune makes it easy to load checkpoints for LoRA directly from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32mbase_model.state_dict\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\u001b[32mstrict\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. note::\\n Whenever loading weights with :code:`\u001b[0m\u001b[32mstrict\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Set \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m on lora_params, and \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m on all others.\\n set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Print the total number of parameters\\n total_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n trainable_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m if p.requires_grad\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n f\"\"\"\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtotal_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m total params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtrainable_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\" trainable params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m(\u001b[0m\u001b[32m100.0 * trainable_params / total_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:.2f\u001b[0m\u001b[32m}\u001b[0m\u001b[32m% of all params are trainable.\\n \"\"\"\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas detailed :ref:`here`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs \u001b[0m\u001b[32m(\u001b[0m\u001b[32meach having VRAM of at least 16GB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.\u001b[0m\u001b[32mcheckpoint_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32mmy_model_checkpoint_path\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32mtokenizer_checkpoint\u001b[0m\u001b[32m=\u001b[0m\u001b[32mmy_tokenizer_checkpoint_path\u001b[0m\u001b[32m`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma\u001b[0m\u001b[32m)\u001b[0m\u001b[32m the number of GPUs you have available,\\n and \u001b[0m\u001b[32m(\u001b[0m\u001b[32mb\u001b[0m\u001b[32m)\u001b[0m\u001b[32m the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\'q_proj\\', \\'v_proj\\'\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\\nbut as long as we keep :code:`rank<`_\\nfloating-point format. This can be done via the command:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama2/7B_lora_single_device\\n\\nOn a single device, we may need to be more cognizant of our peak memory. Let\\'s run a few experiments\\nto see our peak memory during a finetune. We will experiment along two axes:\\nfirst, which model layers have LoRA applied, and second, the rank of each LoRA layer. \u001b[0m\u001b[32m(\u001b[0m\u001b[32mWe will scale\\nalpha in parallel to LoRA rank, as discussed above.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nTo compare the results of our experiments, we can evaluate our models on `truthfulqa_mc2 `_, a task from\\nthe `TruthfulQA `_ benchmark for language models. For more details on how to run this and other evaluation tasks\\nwith torchtune\\'s EleutherAI evaluation harness integration, see our :ref:`End-to-End Workflow Tutorial `.\\n\\nPreviously, we only enabled LoRA for the linear layers in each self-attention module, but in fact there are other linear\\nlayers we can apply LoRA to: MLP layers and our model\\'s final output projection. Note that for Llama-2-7B the final output\\nprojection maps to the vocabulary dimension \u001b[0m\u001b[32m(\u001b[0m\u001b[32m32000 instead of 4096 as in the other linear layers\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, so enabling LoRA for this layer will increase\\nour peak memory a bit more than the other layers. We can make the following changes to our config:\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\\'q_proj\\', \\'k_proj\\', \\'v_proj\\', \\'output_proj\\'\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n apply_lora_to_mlp: True\\n apply_lora_to_output: True\\n ...\\n\\n.. note::\\n All the finetuning runs below use the `llama2/7B_lora_single_device `_\\n config, which has a default batch size of 2. Modifying the batch size \u001b[0m\u001b[32m(\u001b[0m\u001b[32mor other hyperparameters, e.g. the optimizer\u001b[0m\u001b[32m)\u001b[0m\u001b[32m will impact both peak memory\\n and final evaluation results.\\n\\n.. 
list-table::\\n :widths: 25 25 25 25 25\\n :header-rows: 1\\n\\n * - LoRA Layers\\n - Rank\\n - Alpha\\n - Peak Memory\\n - Accuracy \u001b[0m\u001b[32m(\u001b[0m\u001b[32mtruthfulqa_mc2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n * - Q and V only\\n - 8\\n - 16\\n - **15.57 GB**\\n - 0.475\\n * - all layers\\n - 8\\n - 16\\n - 15.87 GB\\n - 0.508\\n * - Q and V only\\n - 64\\n - 128\\n - 15.86 GB\\n - 0.504\\n * - all layers\\n - 64\\n - 128\\n - 17.04 GB\\n - **0.514**\\n\\nWe can see that our baseline settings give the lowest peak memory, but our evaluation performance is relatively lower.\\nBy enabling LoRA for all linear layers and increasing the rank to 64, we see almost a 4% absolute improvement\\nin our accuracy on this task, but our peak memory also increases by about 1.4GB. These are just a couple simple\\nexperiments; we encourage you to run your own finetunes to find the right tradeoff for your particular setup.\\n\\nAdditionally, if you want to decrease your model\\'s peak memory even further \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand still potentially achieve similar\\nmodel quality results\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, you can check out our :ref:`QLoRA tutorial\u001b[0m\u001b[32m`.\\n'\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33moutput_message\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33msession_id\u001b[0m=\u001b[32m'6910f07f-f8e0-407b-8441-60a90e7b1834'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m16\u001b[0m, \u001b[1;36m883581\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[35mdatetime\u001b[0m.timezone.utc\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33msteps\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;35mInferenceStep\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mapi_model_response\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. 
The `bfloat16` format uses 2 bytes per model parameter, which is half the memory of `fp32`, and also improves training speed.'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'49409ea3-4a4d-4433-aa71-e6e4ec1bb054'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'inference'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'212541bc-0cfa-4f04-a8a5-25fe2892bc8f'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m144218\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m17\u001b[0m, \u001b[1;36m267803\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'212541bc-0cfa-4f04-a8a5-25fe2892bc8f'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m22\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m29\u001b[0m, \u001b[1;36m19\u001b[0m, \u001b[1;36m155387\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0mUTC\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33moutput_attachments\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "simple_session_id = simple_agent.create_session(session_name=f\"simple_session_{uuid.uuid4()}\")\n", - "response = simple_agent.create_turn(\n", - " messages=[\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"What precision formats does torchtune support?\"\n", - " }\n", - " ],\n", - " documents=attachments,\n", - " session_id=simple_session_id,\n", - " stream=False\n", - " )\n", - "\n", - "pprint(response)\n", - "\n", - "session_response = client.agents.session.retrieve(agent_id=simple_agent.agent_id, session_id=simple_session_id)\n", - "pprint(session_response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - 
"metadata": { - "kernelspec": { - "display_name": "master", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}