From 85d0f5f5285e706ee2fb2ac3a07923bf0459beb2 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Tue, 17 Dec 2024 14:09:32 -0800 Subject: [PATCH] modify doc --- distributions/dependencies.json | 256 +++++++++--------- .../self_hosted_distro/meta-reference-gpu.md | 16 +- .../meta-reference-quantized-gpu.md | 14 +- .../meta-reference-gpu/doc_template.md | 12 +- .../meta-reference-gpu/meta_reference.py | 4 +- .../doc_template.md | 12 +- .../meta_reference.py | 2 +- 7 files changed, 158 insertions(+), 158 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 7a974b917..366a2a0f2 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,9 +1,9 @@ { - "hf-serverless": [ - "aiohttp", + "bedrock": [ "aiosqlite", "autoevals", "blobfile", + "boto3", "chardet", "chromadb-client", "datasets", @@ -11,100 +11,6 @@ "fastapi", "fire", "httpx", - "huggingface_hub", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "together": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "together", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "vllm-gpu": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "vllm", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "remote-vllm": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", "matplotlib", "nltk", "numpy", @@ -157,7 +63,7 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "tgi": [ + "hf-endpoint": [ "aiohttp", "aiosqlite", "autoevals", @@ -190,11 +96,11 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "bedrock": [ + "hf-serverless": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", - "boto3", "chardet", "chromadb-client", "datasets", @@ -202,6 +108,7 @@ "fastapi", "fire", "httpx", + "huggingface_hub", "matplotlib", "nltk", "numpy", @@ -300,34 +207,6 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "cerebras": [ - "aiosqlite", - "blobfile", - "cerebras_cloud_sdk", - "chardet", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", 
- "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], "ollama": [ "aiohttp", "aiosqlite", @@ -361,7 +240,7 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ + "tgi": [ "aiohttp", "aiosqlite", "autoevals", @@ -393,5 +272,126 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "together": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "remote-vllm": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "vllm-gpu": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "cerebras": [ + "aiosqlite", + "blobfile", + "cerebras_cloud_sdk", + "chardet", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index d46039318..f6df8b1f3 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -31,9 +31,9 @@ Note that you need access to nvidia GPUs to run this distribution. 
This distribu
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
-- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `Llama-Guard-3-1B`)
 - `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)
 
 
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-gpu --image-type conda
 llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
index 837be744a..73db33026 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
@@ -33,7 +33,7 @@ Note that you need access to nvidia GPUs to run this distribution. 
This distribu
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
 
 
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-quantized-gpu --image-type conda
 llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md
index 421812dbc..71653cfc1 100644
--- a/llama_stack/templates/meta-reference-gpu/doc_template.md
+++ b/llama_stack/templates/meta-reference-gpu/doc_template.md
@@ -53,7 +53,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -65,8 +65,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -77,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -85,6 +85,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env 
INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index 461d89a4a..0c809016c 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -112,7 +112,7 @@ def get_distribution_template() -> DistributionTemplate: "Port for the Llama Stack distribution server", ), "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", + "Llama3.2-3B-Instruct", "Inference model loaded into the Meta Reference server", ), "INFERENCE_CHECKPOINT_DIR": ( @@ -120,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate: "Directory containing the Meta Reference model checkpoint", ), "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", + "Llama-Guard-3-1B", "Name of the safety (Llama-Guard) model to use", ), "SAFETY_CHECKPOINT_DIR": ( diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index daa380d20..897a5faf7 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -55,7 +55,7 @@ docker run \ -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct + --env INFERENCE_MODEL=Llama3.2-3B-Instruct ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -67,8 +67,8 @@ docker run \ -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B + --env INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` ### Via Conda @@ -79,7 +79,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a llama stack build --template {{ name }} --image-type conda llama stack run distributions/{{ name }}/run.yaml \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct + --env INFERENCE_MODEL=Llama3.2-3B-Instruct ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -87,6 +87,6 @@ If you are using Llama Stack Safety / Shield APIs, use: ```bash llama stack run distributions/{{ name }}/run-with-safety.yaml \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B + --env INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index c460860c5..68d84ba67 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -84,7 +84,7 @@ def get_distribution_template() -> DistributionTemplate: "Port for the Llama Stack distribution server", ), "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", + "Llama3.2-3B-Instruct", "Inference model loaded into the Meta Reference server", ), "INFERENCE_CHECKPOINT_DIR": (
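
A note on the defaults this patch switches to: `Llama3.2-3B-Instruct` and `Llama-Guard-3-1B` are native `llama` CLI model descriptors rather than Hugging Face repo IDs such as `meta-llama/Llama-3.2-3B-Instruct`, so the meta-reference provider expects matching checkpoints under `~/.llama`, which the docker examples mount into the container. The sketch below walks through that prerequisite; it assumes the `llama` CLI that ships with `llama-stack`/`llama-models`, and the download flags and checkpoint layout shown are assumptions to verify against `llama download --help`.

```bash
# Fetch the checkpoints by their native descriptors (flags are assumptions to verify;
# $META_URL stands in for the download URL Meta sends after you request model access).
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url "$META_URL"
llama download --source meta --model-id Llama-Guard-3-1B --meta-url "$META_URL"

# The values passed via --env must match the checkpoint directories on disk
# (assumed layout: ~/.llama/checkpoints/<descriptor>).
ls ~/.llama/checkpoints

# Launch with the same descriptors, mirroring the run-with-safety examples above.
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
  --env SAFETY_MODEL=Llama-Guard-3-1B
```

If a checkpoint lives elsewhere, the `INFERENCE_CHECKPOINT_DIR` / `SAFETY_CHECKPOINT_DIR` variables documented above (default `null`) point the provider at that directory instead.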