Introduce Llama stack distributions (#22)

* Add distribution CLI scaffolding

* More progress towards `llama distribution install`

* getting closer to a distro definition, distro install + configure works

* Distribution server now functioning

* read existing configuration, save enums properly

* Remove inference uvicorn server entrypoint and llama inference CLI command

* updated dependency and client model name

* Improved exception handling

* local imports for faster cli

* undo a typo, add a passthrough distribution

* implement full-passthrough in the server

* add safety adapters, configuration handling, server + clients

* cleanup, moving stuff to common, nuke utils

* Add a Path() wrapper at the earliest place

* fixes

* Bring agentic system api to toolchain

Add adapter dependencies and resolve adapters using a topological sort

* refactor to reduce size of `agentic_system`

* move straggler files and fix some important existing bugs

* ApiSurface -> Api

* refactor a method out

* Adapter -> Provider

* Make each inference provider into its own subdirectory

* installation fixes

* Rename Distribution -> DistributionSpec, simplify RemoteProviders

* dict key instead of attr

* update inference config to take model and not model_dir

* Fix passthrough streaming; send headers properly, not as part of the body :facepalm

* update safety to use model sku ids and not model dirs

* Update cli_reference.md

* minor fixes

* add DistributionConfig, fix a bug in model download

* Make install + start scripts do proper configuration automatically

* Update CLI_reference

* Nuke fp8_requirements, fold fbgemm into common requirements

* Update README, add newline between API surface configurations

* Refactor download functionality out of the Command so can be reused

* Add `llama model download` alias for `llama download`

* Show message about checksum file so users can check themselves

* Simpler intro statements

* get ollama working

* Reduce a bunch of dependencies from toolchain

Some improvements to the distribution install script

* Avoid using `conda run` since it buffers everything

* update dependencies and rely on LLAMA_TOOLCHAIN_DIR for dev purposes

* add validation for configuration input

* resort imports

* make optional subclasses default to yes for configuration

* Remove additional_pip_packages; move deps to providers

* Make the 8B model the default for inline inference

* Add scripts to MANIFEST

* allow installing from test.pypi.org

* Fix #2 to help with testing packages

* Must install llama-models at that same version first

* fix PIP_ARGS

---------

Co-authored-by: Hardik Shah <hjshah@fb.com>
Co-authored-by: Hardik Shah <hjshah@meta.com>
Author: Ashwin Bharambe
Date: 2024-08-08 13:38:41 -07:00 (committed via GitHub)
parent da4645a27a
commit e830814399
115 changed files with 5839 additions and 1120 deletions


@@ -3,3 +3,5 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .distribution import DistributionParser # noqa


@@ -0,0 +1,106 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import json
import shlex
import yaml
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.common.config_dirs import DISTRIBS_BASE_DIR
from termcolor import cprint
class DistributionConfigure(Subcommand):
"""Llama cli for configuring llama toolchain configs"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"configure",
prog="llama distribution configure",
description="configure a llama stack distribution",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_distribution_configure_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--name",
type=str,
help="Name of the distribution to configure",
required=True,
)
def _run_distribution_configure_cmd(self, args: argparse.Namespace) -> None:
from llama_toolchain.distribution.datatypes import DistributionConfig
from llama_toolchain.distribution.registry import resolve_distribution_spec
config_file = DISTRIBS_BASE_DIR / args.name / "config.yaml"
if not config_file.exists():
self.parser.error(
f"Could not find {config_file}. Please run `llama distribution install` first"
)
return
# we need to find the spec from the name
with open(config_file, "r") as f:
config = DistributionConfig(**yaml.safe_load(f))
dist = resolve_distribution_spec(config.spec)
if dist is None:
raise ValueError(f"Could not find any registered spec `{config.spec}`")
configure_llama_distribution(dist, config)
def configure_llama_distribution(dist: "Distribution", config: "DistributionConfig"):
from llama_toolchain.common.exec import run_command
from llama_toolchain.common.prompt_for_config import prompt_for_config
from llama_toolchain.common.serialize import EnumEncoder
from llama_toolchain.distribution.dynamic import instantiate_class_type
python_exe = run_command(shlex.split("which python"))
# simple check
conda_env = config.conda_env
if conda_env not in python_exe:
raise ValueError(
f"Please re-run configure by activating the `{conda_env}` conda environment"
)
if config.providers:
cprint(
f"Configuration already exists for {config.name}. Will overwrite...",
"yellow",
attrs=["bold"],
)
for api, provider_spec in dist.provider_specs.items():
cprint(f"Configuring API surface: {api.value}", "white", attrs=["bold"])
config_type = instantiate_class_type(provider_spec.config_class)
provider_config = prompt_for_config(
config_type,
(
config_type(**config.providers[api.value])
if api.value in config.providers
else None
),
)
print("")
config.providers[api.value] = {
"provider_id": provider_spec.provider_id,
**provider_config.dict(),
}
config_path = DISTRIBS_BASE_DIR / config.name / "config.yaml"
with open(config_path, "w") as fp:
dist_config = json.loads(json.dumps(config.dict(), cls=EnumEncoder))
fp.write(yaml.dump(dist_config, sort_keys=False))
print(f"YAML configuration has been written to {config_path}")


@@ -0,0 +1,44 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_toolchain.cli.subcommand import Subcommand
class DistributionCreate(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"create",
prog="llama distribution create",
description="create a Llama stack distribution",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_distribution_create_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--name",
type=str,
help="Name of the distribution to create",
required=True,
)
# for each Api the user wants to support, we should
# get the list of available providers, ask which one the user
# wants to pick and then ask for their configuration.
def _run_distribution_create_cmd(self, args: argparse.Namespace) -> None:
from llama_toolchain.distribution.registry import resolve_distribution_spec
dist = resolve_distribution_spec(args.name)
if dist is not None:
self.parser.error(f"Distribution with name {args.name} already exists")
return
raise NotImplementedError()


@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_toolchain.cli.subcommand import Subcommand
from .configure import DistributionConfigure
from .create import DistributionCreate
from .install import DistributionInstall
from .list import DistributionList
from .start import DistributionStart
class DistributionParser(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"distribution",
prog="llama distribution",
description="Operate on llama stack distributions",
)
subparsers = self.parser.add_subparsers(title="distribution_subcommands")
# Add sub-commands
DistributionList.create(subparsers)
DistributionInstall.create(subparsers)
DistributionCreate.create(subparsers)
DistributionConfigure.create(subparsers)
DistributionStart.create(subparsers)
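
For orientation, a stand-alone sketch of the subcommand pattern these classes follow: each subcommand registers its own argparse parser and binds a handler via set_defaults(func=...), and the top-level parser dispatches to it. The Subcommand base class itself is not part of this diff, so its create() helper is assumed here to simply instantiate the class.

import argparse


class HelloCommand:
    # stand-in for llama_toolchain.cli.subcommand.Subcommand, which is not shown in this diff
    def __init__(self, subparsers: argparse._SubParsersAction):
        self.parser = subparsers.add_parser("hello", description="Say hello")
        self.parser.add_argument("--name", required=True)
        self.parser.set_defaults(func=self._run)

    @classmethod
    def create(cls, subparsers):
        return cls(subparsers)

    def _run(self, args: argparse.Namespace) -> None:
        print(f"hello, {args.name}")


parser = argparse.ArgumentParser(prog="llama")
subparsers = parser.add_subparsers(title="subcommands")
HelloCommand.create(subparsers)

args = parser.parse_args(["hello", "--name", "world"])
args.func(args)  # the parsed namespace carries the handler chosen by the subcommand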


@@ -0,0 +1,111 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import os
import pkg_resources
import yaml
from termcolor import cprint
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.common.config_dirs import DISTRIBS_BASE_DIR
class DistributionInstall(Subcommand):
"""Llama cli for configuring llama toolchain configs"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"install",
prog="llama distribution install",
description="Install a llama stack distribution",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_distribution_install_cmd)
def _add_arguments(self):
from llama_toolchain.distribution.registry import available_distribution_specs
self.parser.add_argument(
"--spec",
type=str,
help="Distribution spec to install (try ollama-inline)",
required=True,
choices=[d.spec_id for d in available_distribution_specs()],
)
self.parser.add_argument(
"--name",
type=str,
help="What should the installation be called locally?",
required=True,
)
self.parser.add_argument(
"--conda-env",
type=str,
help="conda env in which this distribution will run (default = distribution name)",
)
def _run_distribution_install_cmd(self, args: argparse.Namespace) -> None:
from llama_toolchain.common.exec import run_with_pty
from llama_toolchain.distribution.datatypes import DistributionConfig
from llama_toolchain.distribution.distribution import distribution_dependencies
from llama_toolchain.distribution.registry import resolve_distribution_spec
os.makedirs(DISTRIBS_BASE_DIR, exist_ok=True)
script = pkg_resources.resource_filename(
"llama_toolchain",
"distribution/install_distribution.sh",
)
dist = resolve_distribution_spec(args.spec)
if dist is None:
self.parser.error(f"Could not find distribution {args.spec}")
return
distrib_dir = DISTRIBS_BASE_DIR / args.name
os.makedirs(distrib_dir, exist_ok=True)
deps = distribution_dependencies(dist)
if not args.conda_env:
print(f"Using {args.name} as the Conda environment for this distribution")
conda_env = args.conda_env or args.name
config_file = distrib_dir / "config.yaml"
if config_file.exists():
c = DistributionConfig(**yaml.safe_load(config_file.read_text()))
if c.spec != dist.spec_id:
self.parser.error(
f"already installed distribution with `spec={c.spec}` does not match provided spec `{args.spec}`"
)
return
if c.conda_env != conda_env:
self.parser.error(
f"already installed distribution has `conda_env={c.conda_env}` different from provided conda env `{conda_env}`"
)
return
else:
with open(config_file, "w") as f:
c = DistributionConfig(
spec=dist.spec_id,
name=args.name,
conda_env=conda_env,
)
f.write(yaml.dump(c.dict(), sort_keys=False))
return_code = run_with_pty([script, conda_env, args.name, " ".join(deps)])
assert return_code == 0, cprint(
f"Failed to install distribution {dist.spec_id}", color="red"
)
cprint(
f"Distribution `{args.name}` (with spec {dist.spec_id}) has been installed successfully!",
color="green",
)
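
The commit message mentions resolving adapters/providers with a topological sort; `distribution_dependencies(dist)` above is where that resolution feeds into the install. A minimal sketch of the idea using only the standard library (the dependency graph below is made up, not the real provider graph):

from graphlib import TopologicalSorter  # stdlib, Python 3.9+

# provider -> set of providers it depends on (illustrative only)
deps = {
    "agentic_system": {"inference", "safety"},
    "safety": {"inference"},
    "inference": set(),
}

order = list(TopologicalSorter(deps).static_order())
print(order)  # dependencies first, e.g. ['inference', 'safety', 'agentic_system']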


@@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import json
from llama_toolchain.cli.subcommand import Subcommand
class DistributionList(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list",
prog="llama distribution list",
description="Show available llama stack distributions",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_distribution_list_cmd)
def _add_arguments(self):
pass
def _run_distribution_list_cmd(self, args: argparse.Namespace) -> None:
from llama_toolchain.cli.table import print_table
from llama_toolchain.distribution.registry import available_distribution_specs
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Spec ID",
"ProviderSpecs",
"Description",
]
rows = []
for spec in available_distribution_specs():
providers = {k.value: v.provider_id for k, v in spec.provider_specs.items()}
rows.append(
[
spec.spec_id,
json.dumps(providers, indent=2),
spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)


@@ -0,0 +1,82 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import pkg_resources
import yaml
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.common.config_dirs import DISTRIBS_BASE_DIR
class DistributionStart(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"start",
prog="llama distribution start",
description="""start the server for a Llama stack distribution. you should have already installed and configured the distribution""",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_distribution_start_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--name",
type=str,
help="Name of the distribution to start",
required=True,
)
self.parser.add_argument(
"--port",
type=int,
help="Port to run the server on. Defaults to 5000",
default=5000,
)
self.parser.add_argument(
"--disable-ipv6",
action="store_true",
help="Disable IPv6 support",
default=False,
)
def _run_distribution_start_cmd(self, args: argparse.Namespace) -> None:
from llama_toolchain.common.exec import run_with_pty
from llama_toolchain.distribution.registry import resolve_distribution_spec
config_file = DISTRIBS_BASE_DIR / args.name / "config.yaml"
if not config_file.exists():
self.parser.error(
f"Could not find {config_file}. Please run `llama distribution install` first"
)
return
# we need to find the spec from the name
with open(config_file, "r") as f:
config = yaml.safe_load(f)
dist = resolve_distribution_spec(config["spec"])
if dist is None:
raise ValueError(f"Could not find any registered spec `{config['spec']}`")
conda_env = config["conda_env"]
if not conda_env:
raise ValueError(
f"Could not find Conda environment for distribution `{args.name}`"
)
script = pkg_resources.resource_filename(
"llama_toolchain",
"distribution/start_distribution.sh",
)
args = [script, conda_env, config_file, "--port", str(args.port)] + (
["--disable-ipv6"] if args.disable_ipv6 else []
)
run_with_pty(args)
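
Both `configure` and `start` read the same per-distribution config.yaml that `install` writes above. A hypothetical example of what such a file holds, expressed as the dict that `yaml.safe_load` returns (the field names come from the code; the concrete values, the provider-specific fields, and the exact location under DISTRIBS_BASE_DIR are made up):

import yaml

example = """\
name: local-ollama
spec: ollama-inline
conda_env: local-ollama
providers:
  inference:
    provider_id: meta-ollama
    url: http://localhost:11434
"""

config = yaml.safe_load(example)
print(config["spec"], config["conda_env"])  # the two fields `llama distribution start` checks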


@@ -9,26 +9,14 @@ import asyncio
import os
import shutil
import time
from functools import partial
from pathlib import Path
import httpx
from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from llama_models.datatypes import Model
from llama_models.sku_list import (
all_registered_models,
llama_meta_net_info,
resolve_model,
)
from termcolor import cprint
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.utils import DEFAULT_DUMP_DIR
DEFAULT_CHECKPOINT_DIR = os.path.join(DEFAULT_DUMP_DIR, "checkpoints")
class Download(Subcommand):
@@ -42,107 +30,130 @@ class Download(Subcommand):
description="Download a model from llama.meta.comf or HuggingFace hub",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_download_cmd)
setup_download_parser(self.parser)
def _add_arguments(self):
models = all_registered_models()
self.parser.add_argument(
"--source",
choices=["meta", "huggingface"],
required=True,
)
self.parser.add_argument(
"--model-id",
choices=[x.descriptor() for x in models],
required=True,
)
self.parser.add_argument(
"--hf-token",
type=str,
required=False,
default=None,
help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
)
self.parser.add_argument(
"--meta-url",
type=str,
required=False,
help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
)
self.parser.add_argument(
"--ignore-patterns",
type=str,
required=False,
default="*.safetensors",
help="""
def setup_download_parser(parser: argparse.ArgumentParser) -> None:
from llama_models.sku_list import all_registered_models
models = all_registered_models()
parser.add_argument(
"--source",
choices=["meta", "huggingface"],
required=True,
)
parser.add_argument(
"--model-id",
choices=[x.descriptor() for x in models],
required=True,
)
parser.add_argument(
"--hf-token",
type=str,
required=False,
default=None,
help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
)
parser.add_argument(
"--meta-url",
type=str,
required=False,
help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
)
parser.add_argument(
"--ignore-patterns",
type=str,
required=False,
default="*.safetensors",
help="""
For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
safetensors files to avoid downloading duplicate weights.
""",
)
parser.set_defaults(func=partial(run_download_cmd, parser=parser))
def _hf_download(
model: "Model",
hf_token: str,
ignore_patterns: str,
parser: argparse.ArgumentParser,
):
from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from llama_toolchain.common.model_utils import model_local_dir
repo_id = model.huggingface_repo
if repo_id is None:
raise ValueError(f"No repo id found for model {model.descriptor()}")
output_dir = model_local_dir(model)
os.makedirs(output_dir, exist_ok=True)
try:
true_output_dir = snapshot_download(
repo_id,
local_dir=output_dir,
ignore_patterns=ignore_patterns,
token=hf_token,
library_name="llama-toolchain",
)
except GatedRepoError:
parser.error(
"It looks like you are trying to access a gated repository. Please ensure you "
"have access to the repository and have provided the proper Hugging Face API token "
"using the option `--hf-token` or by running `huggingface-cli login`."
"You can find your token by visiting https://huggingface.co/settings/tokens"
)
except RepositoryNotFoundError:
parser.error(f"Repository '{args.repo_id}' not found on the Hugging Face Hub.")
except Exception as e:
parser.error(e)
def _hf_download(self, model: Model, hf_token: str, ignore_patterns: str):
repo_id = model.huggingface_repo
if repo_id is None:
raise ValueError(f"No repo id found for model {model.descriptor()}")
print(f"\nSuccessfully downloaded model to {true_output_dir}")
output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
os.makedirs(output_dir, exist_ok=True)
try:
true_output_dir = snapshot_download(
repo_id,
local_dir=output_dir,
ignore_patterns=ignore_patterns,
token=hf_token,
library_name="llama-toolchain",
def _meta_download(model: "Model", meta_url: str):
from llama_models.sku_list import llama_meta_net_info
from llama_toolchain.common.model_utils import model_local_dir
output_dir = Path(model_local_dir(model))
os.makedirs(output_dir, exist_ok=True)
info = llama_meta_net_info(model)
# I believe we can use some concurrency here if needed but not sure it is worth it
for f in info.files:
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
cprint(f"Downloading `{f}`...", "white")
downloader = ResumableDownloader(url, output_file, total_size)
asyncio.run(downloader.download())
print(f"\nSuccessfully downloaded model to {output_dir}")
cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
from llama_models.sku_list import resolve_model
model = resolve_model(args.model_id)
if model is None:
parser.error(f"Model {args.model_id} not found")
return
if args.source == "huggingface":
_hf_download(model, args.hf_token, args.ignore_patterns, parser)
else:
meta_url = args.meta_url
if not meta_url:
meta_url = input(
"Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
)
except GatedRepoError:
self.parser.error(
"It looks like you are trying to access a gated repository. Please ensure you "
"have access to the repository and have provided the proper Hugging Face API token "
"using the option `--hf-token` or by running `huggingface-cli login`."
"You can find your token by visiting https://huggingface.co/settings/tokens"
)
except RepositoryNotFoundError:
self.parser.error(
f"Repository '{args.repo_id}' not found on the Hugging Face Hub."
)
except Exception as e:
self.parser.error(e)
print(f"Successfully downloaded model to {true_output_dir}")
def _meta_download(self, model: Model, meta_url: str):
output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
os.makedirs(output_dir, exist_ok=True)
info = llama_meta_net_info(model)
# I believe we can use some concurrency here if needed but not sure it is worth it
for f in info.files:
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
cprint(f"Downloading `{f}`...", "white")
downloader = ResumableDownloader(url, output_file, total_size)
asyncio.run(downloader.download())
def _run_download_cmd(self, args: argparse.Namespace):
model = resolve_model(args.model_id)
if model is None:
self.parser.error(f"Model {args.model_id} not found")
return
if args.source == "huggingface":
self._hf_download(model, args.hf_token, args.ignore_patterns)
else:
meta_url = args.meta_url
if not meta_url:
meta_url = input(
"Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
)
assert meta_url is not None and "llama3-1.llamameta.net" in meta_url
self._meta_download(model, meta_url)
assert meta_url is not None and "llama3-1.llamameta.net" in meta_url
_meta_download(model, meta_url)
class ResumableDownloader:
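
One detail worth calling out from the meta download path: the signed URL from llama.meta.com contains a `*` placeholder, and `_meta_download` expands it once per file. A small illustration (folder and file names are made up; the real ones come from `llama_meta_net_info(model)`):

meta_url = "https://llama3-1.llamameta.net/*?Policy=abc&Signature=xyz"  # placeholder URL for illustration
folder = "Meta-Llama-3.1-8B-Instruct"

for name in ["consolidated.00.pth", "params.json", "checklist.chk"]:
    url = meta_url.replace("*", f"{folder}/{name}")
    print(url)
# e.g. https://llama3-1.llamameta.net/Meta-Llama-3.1-8B-Instruct/params.json?Policy=abc&Signature=xyz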


@@ -1,91 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import os
import textwrap
from pathlib import Path
import pkg_resources
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.utils import DEFAULT_DUMP_DIR
CONFIGS_BASE_DIR = os.path.join(DEFAULT_DUMP_DIR, "configs")
class InferenceConfigure(Subcommand):
"""Llama cli for configuring llama toolchain configs"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"configure",
prog="llama inference configure",
description="Configure llama toolchain inference configs",
epilog=textwrap.dedent(
"""
Example:
llama inference configure
"""
),
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_inference_configure_cmd)
def _add_arguments(self):
pass
def read_user_inputs(self):
checkpoint_dir = input(
"Enter the checkpoint directory for the model (e.g., ~/.llama/checkpoints/Meta-Llama-3-8B/): "
)
model_parallel_size = input(
"Enter model parallel size (e.g., 1 for 8B / 8 for 70B and 405B): "
)
assert model_parallel_size.isdigit() and int(model_parallel_size) in {
1,
8,
}, "model parallel size must be 1 or 8"
return checkpoint_dir, model_parallel_size
def write_output_yaml(self, checkpoint_dir, model_parallel_size, yaml_output_path):
default_conf_path = pkg_resources.resource_filename(
"llama_toolchain", "data/default_inference_config.yaml"
)
with open(default_conf_path, "r") as f:
yaml_content = f.read()
yaml_content = yaml_content.format(
checkpoint_dir=checkpoint_dir,
model_parallel_size=model_parallel_size,
)
with open(yaml_output_path, "w") as yaml_file:
yaml_file.write(yaml_content.strip())
print(f"YAML configuration has been written to {yaml_output_path}")
def _run_inference_configure_cmd(self, args: argparse.Namespace) -> None:
checkpoint_dir, model_parallel_size = self.read_user_inputs()
checkpoint_dir = os.path.expanduser(checkpoint_dir)
assert (
Path(checkpoint_dir).exists() and Path(checkpoint_dir).is_dir()
), f"{checkpoint_dir} does not exist or it not a directory"
os.makedirs(CONFIGS_BASE_DIR, exist_ok=True)
yaml_output_path = Path(CONFIGS_BASE_DIR) / "inference.yaml"
self.write_output_yaml(
checkpoint_dir,
model_parallel_size,
yaml_output_path,
)


@@ -1,36 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import textwrap
from llama_toolchain.cli.inference.configure import InferenceConfigure
from llama_toolchain.cli.inference.start import InferenceStart
from llama_toolchain.cli.subcommand import Subcommand
class InferenceParser(Subcommand):
"""Llama cli for inference apis"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"inference",
prog="llama inference",
description="Run inference on a llama model",
epilog=textwrap.dedent(
"""
Example:
llama inference start <options>
"""
),
)
subparsers = self.parser.add_subparsers(title="inference_subcommands")
# Add sub-commandsa
InferenceStart.create(subparsers)
InferenceConfigure.create(subparsers)


@@ -1,57 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import textwrap
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.inference.server import main as inference_server_init
class InferenceStart(Subcommand):
"""Llama Inference cli for starting inference server"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"start",
prog="llama inference start",
description="Start an inference server",
epilog=textwrap.dedent(
"""
Example:
llama inference start <options>
"""
),
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_inference_start_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--port",
type=int,
help="Port to run the server on. Defaults to 5000",
default=5000,
)
self.parser.add_argument(
"--disable-ipv6",
action="store_true",
help="Disable IPv6 support",
default=False,
)
self.parser.add_argument(
"--config", type=str, help="Path to config file", default="inference"
)
def _run_inference_start_cmd(self, args: argparse.Namespace) -> None:
inference_server_init(
config_path=args.config,
port=args.port,
disable_ipv6=args.disable_ipv6,
)


@@ -6,9 +6,9 @@
import argparse
from llama_toolchain.cli.download import Download
from llama_toolchain.cli.inference.inference import InferenceParser
from llama_toolchain.cli.model.model import ModelParser
from .distribution import DistributionParser
from .download import Download
from .model import ModelParser
class LlamaCLIParser:
@@ -28,8 +28,8 @@ class LlamaCLIParser:
# Add sub-commands
Download.create(subparsers)
InferenceParser.create(subparsers)
ModelParser.create(subparsers)
DistributionParser.create(subparsers)
# Import sub-commands from agentic_system if they exist
try:


@@ -3,3 +3,5 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .model import ModelParser # noqa


@@ -7,21 +7,13 @@
import argparse
import json
from enum import Enum
from llama_models.sku_list import resolve_model
from termcolor import colored
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.cli.table import print_table
class EnumEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, Enum):
return obj.value
return super().default(obj)
from llama_toolchain.common.serialize import EnumEncoder
class ModelDescribe(Subcommand):
@@ -57,9 +49,9 @@ class ModelDescribe(Subcommand):
rows = [
(
colored("Model", "white", attrs=["bold"]),
colored(model.sku.value, "white", attrs=["bold"]),
colored(model.descriptor(), "white", attrs=["bold"]),
),
("HuggingFace ID", model.huggingface_id or "<Not Available>"),
("HuggingFace ID", model.huggingface_repo or "<Not Available>"),
("Description", model.description_markdown),
("Context Length", f"{model.max_seq_length // 1024}K tokens"),
("Weights format", model.quantization_format.value),


@@ -0,0 +1,24 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_toolchain.cli.subcommand import Subcommand
class ModelDownload(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"download",
prog="llama model download",
description="Download a model from llama.meta.comf or HuggingFace hub",
formatter_class=argparse.RawTextHelpFormatter,
)
from llama_toolchain.cli.download import setup_download_parser
setup_download_parser(self.parser)
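
The alias works because the download arguments and handler were refactored out of the Download command (see the download.py diff above): setup_download_parser() can be pointed at any parser, and functools.partial binds the handler to the parser it should report errors through. A stand-alone sketch of that wiring (argument set trimmed down for brevity):

import argparse
from functools import partial


def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
    print(f"would download {args.model_id} (invoked as `{parser.prog}`)")


def setup_download_parser(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("--model-id", required=True)
    parser.set_defaults(func=partial(run_download_cmd, parser=parser))


root = argparse.ArgumentParser(prog="llama")
sub = root.add_subparsers()
setup_download_parser(sub.add_parser("download"))         # `llama download`
model_sub = sub.add_parser("model").add_subparsers()
setup_download_parser(model_sub.add_parser("download"))   # `llama model download`

args = root.parse_args(["model", "download", "--model-id", "Meta-Llama3.1-8B"])
args.func(args)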


@@ -5,9 +5,9 @@
# the root directory of this source tree.
import argparse
import textwrap
from llama_toolchain.cli.model.describe import ModelDescribe
from llama_toolchain.cli.model.download import ModelDownload
from llama_toolchain.cli.model.list import ModelList
from llama_toolchain.cli.model.template import ModelTemplate
@@ -22,18 +22,13 @@ class ModelParser(Subcommand):
self.parser = subparsers.add_parser(
"model",
prog="llama model",
description="Describe llama model interfaces",
epilog=textwrap.dedent(
"""
Example:
llama model <subcommand> <options>
"""
),
description="Work with llama models",
)
subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commandsa
ModelTemplate.create(subparsers)
# Add sub-commands
ModelDownload.create(subparsers)
ModelList.create(subparsers)
ModelTemplate.create(subparsers)
ModelDescribe.create(subparsers)


@@ -7,14 +7,9 @@
import argparse
import textwrap
from llama_models.llama3_1.api.interface import (
list_jinja_templates,
render_jinja_template,
)
from termcolor import colored
from llama_toolchain.cli.subcommand import Subcommand
from llama_toolchain.cli.table import print_table
class ModelTemplate(Subcommand):
@@ -53,6 +48,12 @@ class ModelTemplate(Subcommand):
)
def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
from llama_models.llama3_1.api.interface import (
list_jinja_templates,
render_jinja_template,
)
from llama_toolchain.cli.table import print_table
if args.name:
template, tokens_info = render_jinja_template(args.name)
rendered = ""


@@ -45,7 +45,7 @@ def format_row(row, col_widths):
def print_table(rows, headers=None, separate_rows: bool = False):
def itemlen(item):
return len(strip_ansi_colors(item))
return max([len(line) for line in strip_ansi_colors(item).split("\n")])
rows = [[x or "" for x in row] for row in rows]
if not headers:
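
The one-line change above exists because table cells can now span multiple lines (for example, the pretty-printed provider JSON in `llama distribution list`), so a column's width has to follow the widest line rather than the raw string length. A quick illustration (ANSI-color stripping omitted):

def itemlen(item: str) -> int:
    return max(len(line) for line in item.split("\n"))


cell = '{\n  "inference": "meta-ollama"\n}'
print(len(cell))      # counts the newlines and every line together
print(itemlen(cell))  # width of the widest single line, which is what column sizing needs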