From 142b36c7c53f851c6fdde83e0618d4669abf3147 Mon Sep 17 00:00:00 2001
From: dltn <6599399+dltn@users.noreply.github.com>
Date: Thu, 25 Jul 2024 12:37:05 -0700
Subject: [PATCH] Add CLI reference doc

---
 README.md                    |   7 +-
 docs/cli_reference.md        | 144 +++++++++++++++++++++++++++++++++++
 llama_toolchain/cli/llama.py |   2 +-
 3 files changed, 149 insertions(+), 4 deletions(-)
 create mode 100644 docs/cli_reference.md
diff --git a/README.md b/README.md
index cf5031263..7d2c15e7c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,12 @@
 # llama-toolchain
 
 This repo contains the API specifications for various components of the Llama Stack as well implementations for some of those APIs like model inference.
-The Stack consists of toolchain-apis and agentic-apis. This repo contains the toolchain-apis
+
+The Llama Stack consists of toolchain-apis and agentic-apis. This repo contains the toolchain-apis.
 
 ## Installation
 
-You can install this repository as a [package](https://pypi.org/project/llama-toolchain/) by just doing `pip install llama-toolchain`
+You can install this repository as a [package](https://pypi.org/project/llama-toolchain/) with `pip install llama-toolchain`
 
 If you want to install from source:
 
@@ -29,7 +30,7 @@ llama --help
 
 usage: llama [-h] {download,inference,model,agentic_system} ...
 
-Welcome to the LLama cli
+Welcome to the Llama CLI
 
 options:
   -h, --help            show this help message and exit
diff --git a/docs/cli_reference.md b/docs/cli_reference.md
new file mode 100644
index 000000000..81c7d3584
--- /dev/null
+++ b/docs/cli_reference.md
@@ -0,0 +1,144 @@
+# Llama CLI Reference
+
+The `llama` CLI tool helps you set up and use the Llama toolchain & agentic systems. It should be available on your path after installing the `llama-toolchain` package.
+
+```
+$ llama --help
+
+Welcome to the Llama CLI
+
+Usage: llama [-h] {download,inference,model,agentic_system} ...
+
+
+Options:
+  -h, --help            Show this help message and exit
+
+
+Subcommands:
+  {download,inference,model,agentic_system}
+```
+
+## Step 1. Get the models
+
+First, you need models locally. You can get the models from [Hugging Face](https://huggingface.co/meta-llama) or [directly from Meta](https://llama.meta.com/llama-downloads/). The `llama download` command streamlines the process.
+
+1. Create and get a Hugging Face access token [here](https://huggingface.co/settings/tokens)
+2. Set the `HF_TOKEN` environment variable
+
+```
+export HF_TOKEN=YOUR_TOKEN_HERE
+llama download meta-llama/Meta-Llama-3.1-70B-Instruct
+```
+
+Run `llama download --help` for more information.
+
+
+## Step 2. Understand the models
+The `llama model` command helps you explore the model’s interface.
+
+```
+$ llama model --help
+usage: llama model [-h] {template} ...
+
+
+Describe llama model interfaces
+
+
+options:
+  -h, --help  show this help message and exit
+
+
+model_subcommands:
+  {template}
+
+
+Example: llama model <subcommand> <options>
+```
+
+You can run `llama model template` to see all of the templates and their tokens:
+
+
+```
+$ llama model template
+
+
+system-message-builtin-and-custom-tools
+system-message-builtin-tools-only
+system-message-custom-tools-only
+system-message-default
+assistant-message-builtin-tool-call
+assistant-message-custom-tool-call
+assistant-message-default
+tool-message-failure
+tool-message-success
+user-message-default
+```
+
+And fetch an example by passing it to `--template`:
+
+```
+$ llama model template --template tool-message-success
+
+
+llama model template --template tool-message-success
+<|start_header_id|>ipython<|end_header_id|>
+
+
+completed
+[stdout]{"results":["something something"]}[/stdout]<|eot_id|>
+```
+
+## Step 3. Start the inference server
+
+Once you have a model, the magic begins with inference. The `llama inference` command can help you configure and launch the Llama Stack inference server.
+
+```
+$ llama inference --help
+
+
+usage: llama inference [-h] {start,configure} ...
+
+
+Run inference on a llama model
+
+
+options:
+  -h, --help         show this help message and exit
+
+
+inference_subcommands:
+  {start,configure}
+
+
+Example: llama inference start <options>
+```
+
+Run `llama inference configure` to set up your configuration at `~/.llama/configs/inference.yaml`. You’ll set up variables like:
+
+
+* the directory where you stored the models you downloaded from step 1
+* the model parallel size (1 for 8B models, 8 for 70B/405B)
+
+
+Once you’ve configured the inference server, run `llama inference start`. The model will load onto the GPU, and you’ll be able to send requests once you see that the server is ready.
+
+
+If you want to use a different model, re-run `llama inference configure` to update the model path, then run `llama inference start` to start the server again.
+
+
+Run `llama inference --help` for more information.
+
+
+## Step 4. Start the agentic system
+
+The `llama agentic_system` command helps you configure and launch agentic systems. The `llama agentic_system configure` command sets up the configuration file the agentic code expects, and the `llama agentic_system start_app` command streamlines launching.
+
+
+For example, let’s run the included chat app:
+
+```
+llama agentic_system configure
+llama agentic_system start_app chat
+```
+
+For more information, run `llama agentic_system --help`.
diff --git a/llama_toolchain/cli/llama.py b/llama_toolchain/cli/llama.py
index b29bad4af..5bdb7ca59 100644
--- a/llama_toolchain/cli/llama.py
+++ b/llama_toolchain/cli/llama.py
@@ -17,7 +17,7 @@ class LlamaCLIParser:
     def __init__(self):
         self.parser = argparse.ArgumentParser(
             prog="llama",
-            description="Welcome to the LLama cli",
+            description="Welcome to the Llama CLI",
             add_help=True,
         )