API Updates (#73)

* API keys passed from the client instead of the distro configuration

* delete distribution registry

* Remove the word "package" from naming

* Introduce a "Router" layer for providers

Some providers are best factored out as thin routing
layers on top of other providers. Consider two examples:

- The inference API should be a routing layer over inference providers,
  routed using the "model" key
- The memory banks API is another instance where various memory bank
  types will be provided by independent providers (e.g., a vector store
  is served by Chroma while a key-value memory can be served by Redis or
  PGVector)

This commit introduces a generalized routing layer for this purpose.
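
To make the idea concrete, here is a minimal sketch of such a routing layer, assuming hypothetical names (`InferenceProvider`, `InferenceRouter`) rather than the exact interfaces in this commit:

```python
# A minimal sketch of a routing layer dispatching on the "model" key.
# InferenceProvider and InferenceRouter are hypothetical names, not the
# exact interfaces introduced in this commit.
from typing import Any, Dict, List, Protocol


class InferenceProvider(Protocol):
    async def chat_completion(
        self, model: str, messages: List[Dict[str, Any]]
    ) -> Any: ...


class InferenceRouter:
    """A thin layer that routes each request to the provider
    registered for its "model" key."""

    def __init__(self, routing_table: Dict[str, InferenceProvider]) -> None:
        # e.g. {"Llama3.1-8B-Instruct": local_impl, "other-model": remote_impl}
        self.routing_table = routing_table

    async def chat_completion(
        self, model: str, messages: List[Dict[str, Any]]
    ) -> Any:
        provider = self.routing_table.get(model)
        if provider is None:
            raise ValueError(f"no provider registered for model '{model}'")
        # The router adds no logic beyond dispatch to the underlying provider.
        return await provider.chat_completion(model=model, messages=messages)
```

The same pattern applies to memory banks, with the routing key being the bank type (or bank ID) rather than the model.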

* update `apis_to_serve`

* llama_toolchain -> llama_stack

* Codemod from llama_toolchain -> llama_stack

- added providers/registry
- cleaned up api/ subdirectories and moved impls away
- restructured api/api.py
- `from llama_stack.apis.<api> import foo` should work now (see the
  example after this list)
- update imports to do llama_stack.apis.<api>
- update many other imports
- added __init__, fixed some registry imports
- updated registry imports
- create_agentic_system -> create_agent
- AgenticSystem -> Agent
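
For illustration, after the codemod per-API symbols resolve under `llama_stack.apis.<api>`; the specific symbol below is illustrative:

```python
# New-style import after the llama_toolchain -> llama_stack codemod.
# `Inference` is an illustrative symbol name.
from llama_stack.apis.inference import Inference

# Previously this would have looked something like:
#   from llama_toolchain.inference.api import Inference
```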

* Moved some stuff out of common/; re-generated OpenAPI spec

* llama-toolchain -> llama-stack (hyphens)

* add control plane API

* add redis adapter + sqlite provider
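
As a rough sketch of the kind of key-value interface such adapters and providers implement (the names below are hypothetical, not the actual API added in this commit; a SQLite provider would implement the same protocol against a local database file):

```python
# Hypothetical key-value store protocol; the control plane API in this
# commit may differ in names and signatures.
from typing import Optional, Protocol


class KVStore(Protocol):
    async def set(self, key: str, value: str) -> None: ...
    async def get(self, key: str) -> Optional[str]: ...


class RedisKVStore:
    """Adapter backed by a Redis server (requires the `redis` package)."""

    def __init__(self, host: str = "localhost", port: int = 6379) -> None:
        import redis.asyncio as redis  # external dependency

        self.client = redis.Redis(host=host, port=port)

    async def set(self, key: str, value: str) -> None:
        await self.client.set(key, value)

    async def get(self, key: str) -> Optional[str]:
        value = await self.client.get(key)
        return value.decode() if value is not None else None
```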

* move core -> distribution

* Some more toolchain -> stack changes

* small naming shenanigans

* Removing custom tool and agent utilities and moving them client side

* Move control plane to distribution server for now

* Remove control plane from API list

* Remove the codeshield dependency that kept sneaking in

* Add "fire" as a dependency

* add back event loggers

* stack configure fixes

* use brave instead of bing in the example client

* add __init__ files so they get packaged

* Update MANIFEST

* bug fix

---------

Co-authored-by: Hardik Shah <hjshah@fb.com>
Co-authored-by: Xi Yan <xiyan@meta.com>
Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

@@ -0,0 +1,30 @@
#!/bin/bash

if [[ $# -ne 1 ]]; then
  echo "Error: Please provide the name of the CONDA environment you wish to create"
  exit 1
fi

ENV_NAME=$1

set -eu
eval "$(conda shell.bash hook)"

echo "Will build env (or overwrite) named '$ENV_NAME'"

set -x

run_build() {
  # Set up the conda environment
  yes | conda remove --name "$ENV_NAME" --all
  yes | conda create -n "$ENV_NAME" python=3.10
  conda activate "$ENV_NAME"

  # PT nightly
  pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

  # install dependencies for `llama-agentic-system`
  pip install -r fp8_requirements.txt
}

run_build

@@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import json
import os
import shutil
import sys
from pathlib import Path
from typing import Optional

import fire
import torch
from fairscale.nn.model_parallel.initialize import (
    get_model_parallel_rank,
    initialize_model_parallel,
    model_parallel_is_initialized,
)
from fp8.fp8_impls import FfnQuantizeMode, quantize_fp8
from llama.model import ModelArgs, Transformer, TransformerBlock
from llama.tokenizer import Tokenizer
from torch.nn.parameter import Parameter


def main(
    ckpt_dir: str,
    tokenizer_path: str,
    quantized_ckpt_dir: str,
    max_seq_len: Optional[int] = 512,
    max_batch_size: Optional[int] = 4,
    model_parallel_size: Optional[int] = None,
    ffn_quantize_mode: Optional[FfnQuantizeMode] = FfnQuantizeMode.FP8_ROWWISE,
    fp8_activation_scale_ub: Optional[float] = 1200.0,
    seed: int = 1,
):
    """Quantize the feed-forward weights of a Llama checkpoint to fp8."""
    if not os.path.exists(quantized_ckpt_dir):
        os.makedirs(quantized_ckpt_dir)
        shutil.copy(
            os.path.join(ckpt_dir, "params.json"),
            os.path.join(quantized_ckpt_dir, "params.json"),
        )
        shutil.copy(
            os.path.join(ckpt_dir, "tokenizer.model"),
            os.path.join(quantized_ckpt_dir, "tokenizer.model"),
        )

    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group("nccl")
    if not model_parallel_is_initialized():
        if model_parallel_size is None:
            model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
        initialize_model_parallel(model_parallel_size)

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(seed)

    if local_rank > 0:
        sys.stdout = open(os.devnull, "w")

    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
    assert model_parallel_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
    ckpt_path = checkpoints[get_model_parallel_rank()]
    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        **params,
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    assert (
        model_args.vocab_size == tokenizer.n_words
    ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"

    # load on CPU in bf16 so that fp8 conversion does not find an unexpected (fp32, e.g.) datatype
    torch.set_default_tensor_type(torch.BFloat16Tensor)
    model = Transformer(model_args)
    model.load_state_dict(checkpoint, strict=False)

    if torch.cuda.is_bf16_supported():
        torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
    else:
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    print(ckpt_path)
    assert (
        quantized_ckpt_dir is not None
    ), "Quantized checkpoint directory should not be None"
    fp8_scales = {}
    for block in model.layers:
        if isinstance(block, TransformerBlock):
            # Skip the first and last layers; they are left unquantized.
            if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
                continue

            # Quantize each feed-forward projection (w1, w3, w2) to fp8 and
            # record its scale, keyed by layer, weight name, and MP rank.
            fp8_weight = quantize_fp8(
                block.feed_forward.w1.weight,
                fp8_activation_scale_ub,
                ffn_quantize_mode,
                output_device=torch.device("cpu"),
            )
            with torch.inference_mode():
                block.feed_forward.w1.weight = Parameter(fp8_weight.weight)
            fp8_scales[
                f"{block.layer_id}_feed_forward.w1_{get_model_parallel_rank()}"
            ] = fp8_weight.scale

            fp8_weight = quantize_fp8(
                block.feed_forward.w3.weight,
                fp8_activation_scale_ub,
                ffn_quantize_mode,
                output_device=torch.device("cpu"),
            )
            with torch.inference_mode():
                block.feed_forward.w3.weight = Parameter(fp8_weight.weight)
            fp8_scales[
                f"{block.layer_id}_feed_forward.w3_{get_model_parallel_rank()}"
            ] = fp8_weight.scale

            fp8_weight = quantize_fp8(
                block.feed_forward.w2.weight,
                fp8_activation_scale_ub,
                ffn_quantize_mode,
                output_device=torch.device("cpu"),
            )
            with torch.inference_mode():
                block.feed_forward.w2.weight = Parameter(fp8_weight.weight)
            fp8_scales[
                f"{block.layer_id}_feed_forward.w2_{get_model_parallel_rank()}"
            ] = fp8_weight.scale

    fp8_scales_path = os.path.join(
        quantized_ckpt_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
    )
    torch.save(fp8_scales, fp8_scales_path)

    ckpt_path = os.path.join(
        quantized_ckpt_dir,
        "consolidated.{:02d}.pth".format(get_model_parallel_rank()),
    )
    torch.save(model.state_dict(), ckpt_path)


if __name__ == "__main__":
    fire.Fire(main)

@@ -0,0 +1,31 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail
set -x

cd "$(git rev-parse --show-toplevel)"

# Usage: $0 MASTER_HOST RUN_ID CKPT_DIR QUANT_CKPT_DIR TOKENIZER_PATH NNODES NPROC
MASTER_HOST=$1
RUN_ID=$2
CKPT_DIR=$3
QUANT_CKPT_DIR=$4
TOKENIZER_PATH=$5
NNODES=$6
NPROC=$7

echo "$MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR"

NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" \
  torchrun \
  --nnodes=$NNODES --nproc_per_node=$NPROC \
  --rdzv_id=$RUN_ID \
  --rdzv_conf='timeout=120' \
  --rdzv_backend=c10d \
  --rdzv_endpoint="${MASTER_HOST}:29502" \
  quantize_checkpoint.py "$CKPT_DIR" "$TOKENIZER_PATH" "$QUANT_CKPT_DIR"