Introduce Llama stack distributions (#22)

* Add distribution CLI scaffolding * More progress towards `llama distribution install` * getting closer to a distro definition, distro install + configure works * Distribution server now functioning * read existing configuration, save enums properly * Remove inference uvicorn server entrypoint and llama inference CLI command * updated dependency and client model name * Improved exception handling * local imports for faster cli * undo a typo, add a passthrough distribution * implement full-passthrough in the server * add safety adapters, configuration handling, server + clients * cleanup, moving stuff to common, nuke utils * Add a Path() wrapper at the earliest place * fixes * Bring agentic system api to toolchain Add adapter dependencies and resolve adapters using a topological sort * refactor to reduce size of `agentic_system` * move straggler files and fix some important existing bugs * ApiSurface -> Api * refactor a method out * Adapter -> Provider * Make each inference provider into its own subdirectory * installation fixes * Rename Distribution -> DistributionSpec, simplify RemoteProviders * dict key instead of attr * update inference config to take model and not model_dir * Fix passthrough streaming, send headers properly not part of body :facepalm * update safety to use model sku ids and not model dirs * Update cli_reference.md * minor fixes * add DistributionConfig, fix a bug in model download * Make install + start scripts do proper configuration automatically * Update CLI_reference * Nuke fp8_requirements, fold fbgemm into common requirements * Update README, add newline between API surface configurations * Refactor download functionality out of the Command so can be reused * Add `llama model download` alias for `llama download` * Show message about checksum file so users can check themselves * Simpler intro statements * get ollama working * Reduce a bunch of dependencies from toolchain Some improvements to the distribution install script * Avoid using 
`conda run` since it buffers everything * update dependencies and rely on LLAMA_TOOLCHAIN_DIR for dev purposes * add validation for configuration input * resort imports * make optional subclasses default to yes for configuration * Remove additional_pip_packages; move deps to providers * for inline make 8b model the default * Add scripts to MANIFEST * allow installing from test.pypi.org * Fix #2 to help with testing packages * Must install llama-models at that same version first * fix PIP_ARGS --------- Co-authored-by: Hardik Shah <hjshah@fb.com> Co-authored-by: Hardik Shah <hjshah@meta.com>
2024-08-08 13:38:41 -07:00 · 2024-08-08 13:38:41 -07:00 · e830814399
commit e830814399
parent da4645a27a
115 changed files with 5839 additions and 1120 deletions
--- a/llama_toolchain/cli/download.py
+++ b/llama_toolchain/cli/download.py
@ -9,26 +9,14 @@ import asyncio
 import os
 import shutil
 import time
+from functools import partial
 from pathlib import Path

 import httpx

-from huggingface_hub import snapshot_download
-from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
-
-from llama_models.datatypes import Model
-from llama_models.sku_list import (
-    all_registered_models,
-    llama_meta_net_info,
-    resolve_model,
-)
 from termcolor import cprint

 from llama_toolchain.cli.subcommand import Subcommand
-from llama_toolchain.utils import DEFAULT_DUMP_DIR
-
-
-DEFAULT_CHECKPOINT_DIR = os.path.join(DEFAULT_DUMP_DIR, "checkpoints")


 class Download(Subcommand):
@ -42,107 +30,130 @@ class Download(Subcommand):
            description="Download a model from llama.meta.comf or HuggingFace hub",
            formatter_class=argparse.RawTextHelpFormatter,
        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_download_cmd)
+        setup_download_parser(self.parser)

-    def _add_arguments(self):
-        models = all_registered_models()
-        self.parser.add_argument(
-            "--source",
-            choices=["meta", "huggingface"],
-            required=True,
-        )
-        self.parser.add_argument(
-            "--model-id",
-            choices=[x.descriptor() for x in models],
-            required=True,
-        )
-        self.parser.add_argument(
-            "--hf-token",
-            type=str,
-            required=False,
-            default=None,
-            help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
-        )
-        self.parser.add_argument(
-            "--meta-url",
-            type=str,
-            required=False,
-            help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
-        )
-        self.parser.add_argument(
-            "--ignore-patterns",
-            type=str,
-            required=False,
-            default="*.safetensors",
-            help="""
+
+def setup_download_parser(parser: argparse.ArgumentParser) -> None:  # Register the download options on `parser` and bind its handler.
+    from llama_models.sku_list import all_registered_models  # function-local import; the commit message cites "local imports for faster cli"
+
+    models = all_registered_models()
+    parser.add_argument(
+        "--source",
+        choices=["meta", "huggingface"],
+        required=True,
+    )
+    parser.add_argument(
+        "--model-id",
+        choices=[x.descriptor() for x in models],  # one accepted value per registered model descriptor
+        required=True,
+    )
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        required=False,
+        default=None,
+        help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
+    )
+    parser.add_argument(
+        "--meta-url",
+        type=str,
+        required=False,
+        help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
+    )
+    parser.add_argument(
+        "--ignore-patterns",
+        type=str,
+        required=False,
+        default="*.safetensors",
+        help="""
 For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
 safetensors files to avoid downloading duplicate weights.
 """,
+    )
+    parser.set_defaults(func=partial(run_download_cmd, parser=parser))  # handler is given `parser` so it can report failures via parser.error()
+
+
+def _hf_download(  # Download the model's Hugging Face snapshot into its local model directory.
+    model: "Model",
+    hf_token: str,
+    ignore_patterns: str,
+    parser: argparse.ArgumentParser,
+):
+    from huggingface_hub import snapshot_download
+    from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+
+    from llama_toolchain.common.model_utils import model_local_dir
+
+    repo_id = model.huggingface_repo
+    if repo_id is None:
+        raise ValueError(f"No repo id found for model {model.descriptor()}")
+
+    output_dir = model_local_dir(model)
+    os.makedirs(output_dir, exist_ok=True)
+    try:
+        true_output_dir = snapshot_download(
+            repo_id,
+            local_dir=output_dir,
+            ignore_patterns=ignore_patterns,
+            token=hf_token,
+            library_name="llama-toolchain",
        )
+    except GatedRepoError:
+        parser.error(
+            "It looks like you are trying to access a gated repository. Please ensure you "
+            "have access to the repository and have provided the proper Hugging Face API token "
+            "using the option `--hf-token` or by running `huggingface-cli login`. "
+            "You can find your token by visiting https://huggingface.co/settings/tokens"
+        )
+    except RepositoryNotFoundError:
+        parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub.")  # `args` is not in scope here; use the local repo_id
+    except Exception as e:
+        parser.error(str(e))  # argparse expects a message string, not an exception object

-    def _hf_download(self, model: Model, hf_token: str, ignore_patterns: str):
-        repo_id = model.huggingface_repo
-        if repo_id is None:
-            raise ValueError(f"No repo id found for model {model.descriptor()}")
+    print(f"\nSuccessfully downloaded model to {true_output_dir}")

-        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
-        os.makedirs(output_dir, exist_ok=True)
-        try:
-            true_output_dir = snapshot_download(
-                repo_id,
-                local_dir=output_dir,
-                ignore_patterns=ignore_patterns,
-                token=hf_token,
-                library_name="llama-toolchain",
+
+def _meta_download(model: "Model", meta_url: str):  # Download every file listed for `model` from the signed Meta URL.
+    from llama_models.sku_list import llama_meta_net_info
+
+    from llama_toolchain.common.model_utils import model_local_dir
+
+    output_dir = Path(model_local_dir(model))
+    os.makedirs(output_dir, exist_ok=True)
+
+    info = llama_meta_net_info(model)
+
+    # I believe we can use some concurrency here if needed but not sure it is worth it
+    for f in info.files:
+        output_file = str(output_dir / f)
+        url = meta_url.replace("*", f"{info.folder}/{f}")  # the "*" in the signed URL is swapped for "<folder>/<file>"
+        total_size = info.pth_size if "consolidated" in f else 0  # a size is passed only for "consolidated" files; 0 otherwise
+        cprint(f"Downloading `{f}`...", "white")
+        downloader = ResumableDownloader(url, output_file, total_size)
+        asyncio.run(downloader.download())  # files are fetched one at a time, one asyncio.run() call per file
+
+    print(f"\nSuccessfully downloaded model to {output_dir}")
+    cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
+
+
+def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):  # CLI handler: resolve --model-id, then dispatch on --source.
+    from llama_models.sku_list import resolve_model
+
+    model = resolve_model(args.model_id)
+    if model is None:
+        parser.error(f"Model {args.model_id} not found")
+        return  # NOTE(review): ArgumentParser.error() normally exits, so this is likely unreachable - confirm
+
+    if args.source == "huggingface":
+        _hf_download(model, args.hf_token, args.ignore_patterns, parser)
+    else:  # every source other than "huggingface" goes through the Meta download path
+        meta_url = args.meta_url
+        if not meta_url:  # no --meta-url given: prompt for it interactively
+            meta_url = input(
+                "Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
            )
-        except GatedRepoError:
-            self.parser.error(
-                "It looks like you are trying to access a gated repository. Please ensure you "
-                "have access to the repository and have provided the proper Hugging Face API token "
-                "using the option `--hf-token` or by running `huggingface-cli login`."
-                "You can find your token by visiting https://huggingface.co/settings/tokens"
-            )
-        except RepositoryNotFoundError:
-            self.parser.error(
-                f"Repository '{args.repo_id}' not found on the Hugging Face Hub."
-            )
-        except Exception as e:
-            self.parser.error(e)
-
-        print(f"Successfully downloaded model to {true_output_dir}")
-
-    def _meta_download(self, model: Model, meta_url: str):
-        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
-        os.makedirs(output_dir, exist_ok=True)
-
-        info = llama_meta_net_info(model)
-
-        # I believe we can use some concurrency here if needed but not sure it is worth it
-        for f in info.files:
-            output_file = str(output_dir / f)
-            url = meta_url.replace("*", f"{info.folder}/{f}")
-            total_size = info.pth_size if "consolidated" in f else 0
-            cprint(f"Downloading `{f}`...", "white")
-            downloader = ResumableDownloader(url, output_file, total_size)
-            asyncio.run(downloader.download())
-
-    def _run_download_cmd(self, args: argparse.Namespace):
-        model = resolve_model(args.model_id)
-        if model is None:
-            self.parser.error(f"Model {args.model_id} not found")
-            return
-
-        if args.source == "huggingface":
-            self._hf_download(model, args.hf_token, args.ignore_patterns)
-        else:
-            meta_url = args.meta_url
-            if not meta_url:
-                meta_url = input(
-                    "Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
-                )
-                assert meta_url is not None and "llama3-1.llamameta.net" in meta_url
-            self._meta_download(model, meta_url)
+            assert meta_url is not None and "llama3-1.llamameta.net" in meta_url  # NOTE(review): only the prompted URL is checked, and asserts are skipped under -O
+        _meta_download(model, meta_url)


 class ResumableDownloader: