From ae671eaf7a04656cf95465f0927b9cc82bcb4bf3 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 28 Oct 2024 17:47:14 -0700
Subject: [PATCH] distro readmes with model serving instructions (#339)

* readme updates
* quantied compose
* dell tgi
* config update
* readme
* update model serving readmes
* update
* update
* config
---
 distributions/fireworks/README.md          | 26 +++++++++++++++++++++++++-
 distributions/fireworks/run.yaml           |  5 +++++
 distributions/meta-reference-gpu/README.md | 16 ++++++++++++++++
 distributions/meta-reference-gpu/run.yaml  |  9 +++++++++
 distributions/ollama/README.md             | 25 +++++++++++++++++++++++++
 distributions/tgi/README.md                | 23 +++++++++++++++++++++++
 distributions/together/README.md           | 23 +++++++++++++++++++++++
 distributions/together/run.yaml            | 13 ++++++++++---
 8 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md
index b5bdd9c17..a753de429 100644
--- a/distributions/fireworks/README.md
+++ b/distributions/fireworks/README.md
@@ -43,7 +43,7 @@ inference:
   provider_type: remote::fireworks
   config:
     url: https://api.fireworks.ai/inference
-    api_key:
+    api_key:
 ```
 **Via Conda**
 ```
 llama stack build --template fireworks --image-type conda
 # -- modify run.yaml to a valid Fireworks server endpoint
 llama stack run ./run.yaml
 ```
+
+### Model Serving
+
+Use `llama-stack-client models list` to check the available models served by Fireworks.
+```
+$ llama-stack-client models list
++------------------------------+------------------------------+---------------+------------+
+| identifier                   | llama_model                  | provider_id   | metadata   |
++==============================+==============================+===============+============+
+| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-1B-Instruct         | Llama3.2-1B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+```
diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml
index c48b0cb7b..4363d86f3 100644
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@@ -17,6 +17,7 @@ providers:
     provider_type: remote::fireworks
     config:
       url: https://api.fireworks.ai/inference
+      # api_key:
   safety:
   - provider_id: meta0
     provider_type: meta-reference
@@ -32,6 +33,10 @@ providers:
   - provider_id: meta0
     provider_type: meta-reference
     config: {}
+    # Uncomment to use weaviate memory provider
+    # - provider_id: weaviate0
+    #   provider_type: remote::weaviate
+    #   config: {}
   agents:
   - provider_id: meta0
     provider_type: meta-reference
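For scripting the same check that `llama-stack-client models list` performs against a Fireworks-backed stack, a minimal Python sketch is shown below. It assumes the `llama_stack_client` package is installed and a stack server is listening on `http://localhost:5000`; the `models.list()` call mirrors the CLI command and the exact method name may differ between SDK versions.

```python
# Minimal sketch: list the models a running Llama Stack server exposes.
# Assumptions: `pip install llama-stack-client`, server on localhost:5000,
# and an SDK `models.list()` method mirroring the CLI command above.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

for model in client.models.list():
    # Each entry should match a row of the table above,
    # e.g. Llama3.1-8B-Instruct served by the fireworks0 provider.
    print(model)
```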
diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md
index 677ea71e5..d4c49aff7 100644
--- a/distributions/meta-reference-gpu/README.md
+++ b/distributions/meta-reference-gpu/README.md
@@ -84,3 +84,19 @@ memory:
 ```

 3. Run `docker compose up` with the updated `run.yaml` file.
+
+### Serving a new model
+You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama` directory.
+```
+inference:
+ - provider_id: meta0
+   provider_type: meta-reference
+   config:
+     model: Llama3.2-11B-Vision-Instruct
+     quantization: null
+     torch_seed: null
+     max_seq_len: 4096
+     max_batch_size: 1
+```
+
+Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
index 724ca030a..9bf7655f9 100644
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@@ -36,6 +36,15 @@ providers:
   - provider_id: meta0
     provider_type: meta-reference
     config: {}
+    # Uncomment to use pgvector
+    # - provider_id: pgvector
+    #   provider_type: remote::pgvector
+    #   config:
+    #     host: 127.0.0.1
+    #     port: 5432
+    #     db: postgres
+    #     user: postgres
+    #     password: mysecretpassword
   agents:
   - provider_id: meta0
     provider_type: meta-reference
diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md
index ad311c4a6..0d2ce6973 100644
--- a/distributions/ollama/README.md
+++ b/distributions/ollama/README.md
@@ -89,3 +89,28 @@ inference:
 llama stack build --template ollama --image-type conda
 llama stack run ./gpu/run.yaml
 ```
+
+### Model Serving
+
+To serve a new model with `ollama`:
+```
+ollama run <model_name>
+```
+
+To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
+```
+$ ollama ps
+
+NAME                         ID              SIZE     PROCESSOR    UNTIL
+llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
+```
+
+To verify that the model served by ollama is correctly connected to the Llama Stack server, run:
+```
+$ llama-stack-client models list
++----------------------+----------------------+---------------+-----------------------------------------------+
+| identifier           | llama_model          | provider_id   | metadata                                      |
++======================+======================+===============+===============================================+
+| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
++----------------------+----------------------+---------------+-----------------------------------------------+
+```
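Once `llama-stack-client models list` shows the ollama-backed model, a quick end-to-end check is to push one chat completion through the stack. The sketch below assumes the `llama_stack_client` Python package and a server on `http://localhost:5000`; the `inference.chat_completion()` call and the message format are assumptions that may vary across SDK versions.

```python
# Minimal sketch: one round-trip through the stack to confirm the
# ollama-served model answers. Method names and message format are
# assumptions based on the llama-stack-client SDK; adjust to your version.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    model="Llama3.1-8B-Instruct",  # identifier reported by `models list`
    messages=[{"role": "user", "content": "Reply with a one-line greeting."}],
)
print(response)
```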
diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md
index 0ea6eec5d..f274f8ff0 100644
--- a/distributions/tgi/README.md
+++ b/distributions/tgi/README.md
@@ -92,3 +92,26 @@ llama stack build --template tgi --image-type conda
 # -- start a TGI server endpoint
 llama stack run ./gpu/run.yaml
 ```
+
+### Model Serving
+To serve a new model with `tgi`, change the docker command flag `--model-id <model_id>`.
+
+This can be done by editing the `command` args in `compose.yaml`, e.g. replacing `Llama-3.2-1B-Instruct` with the model you want to serve.
+
+```
+command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+```
+
+or by changing the docker run command's `--model-id` flag:
+```
+docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009
+```
+
+In `run.yaml`, make sure the inference provider's `url` points to the TGI server endpoint serving your model.
+```
+inference:
+  - provider_id: tgi0
+    provider_type: remote::tgi
+    config:
+      url: http://127.0.0.1:5009
+```
diff --git a/distributions/together/README.md b/distributions/together/README.md
index 1234cab7e..378b7c0c7 100644
--- a/distributions/together/README.md
+++ b/distributions/together/README.md
@@ -56,3 +56,26 @@ llama stack build --template together --image-type conda
 # -- modify run.yaml to a valid Together server endpoint
 llama stack run ./run.yaml
 ```
+
+### Model Serving
+
+Use `llama-stack-client models list` to check the available models served by Together.
+
+```
+$ llama-stack-client models list
++------------------------------+------------------------------+---------------+------------+
+| identifier                   | llama_model                  | provider_id   | metadata   |
++==============================+==============================+===============+============+
+| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+```
diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml
index a08520464..87fd4dcd7 100644
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@@ -17,11 +17,18 @@ providers:
     provider_type: remote::together
     config:
       url: https://api.together.xyz/v1
+      # api_key:
   safety:
-  - provider_id: together0
-    provider_type: remote::together
+  - provider_id: meta0
+    provider_type: meta-reference
     config:
-      url: https://api.together.xyz/v1
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
   - provider_id: meta0
     provider_type: remote::weaviate
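Before running `llama stack run`, it can save a round of debugging to confirm that the endpoint configured under the inference provider is actually reachable, for example the TGI server from the section above. The sketch below reads `run.yaml` and probes a `/health` route; it assumes PyYAML is installed, that the full `run.yaml` nests the endpoint as `providers.inference[0].config.url`, and that your text-generation-inference build exposes `/health`.

```python
# Minimal sketch: check that the inference endpoint in run.yaml answers
# before starting the stack. Assumptions: PyYAML installed, run.yaml nests
# the URL under providers.inference[0].config.url, and the TGI build in use
# exposes a /health route; adjust the path and route for your setup.
import urllib.request

import yaml

with open("run.yaml") as f:
    config = yaml.safe_load(f)

url = config["providers"]["inference"][0]["config"]["url"]
print(f"inference provider url: {url}")

with urllib.request.urlopen(f"{url}/health") as resp:
    print(f"endpoint responded with HTTP {resp.status}")
```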