modify doc

Botao Chen 2024-12-17 14:09:32 -08:00
parent 486c0bc9c8
commit 85d0f5f528
7 changed files with 158 additions and 158 deletions

View file

@@ -1,9 +1,9 @@
 {
-  "hf-serverless": [
-    "aiohttp",
+  "bedrock": [
     "aiosqlite",
     "autoevals",
     "blobfile",
+    "boto3",
     "chardet",
     "chromadb-client",
     "datasets",
@@ -11,100 +11,6 @@
     "fastapi",
     "fire",
     "httpx",
-    "huggingface_hub",
-    "matplotlib",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pypdf",
-    "redis",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "together": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
-    "matplotlib",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pypdf",
-    "redis",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "together",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "vllm-gpu": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
-    "matplotlib",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pypdf",
-    "redis",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "vllm",
-    "sentence-transformers --no-deps",
-    "torch --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "remote-vllm": [
-    "aiosqlite",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
     "matplotlib",
     "nltk",
     "numpy",
@@ -157,7 +63,7 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
-  "tgi": [
+  "hf-endpoint": [
     "aiohttp",
     "aiosqlite",
     "autoevals",
@@ -190,11 +96,11 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
-  "bedrock": [
+  "hf-serverless": [
+    "aiohttp",
     "aiosqlite",
     "autoevals",
     "blobfile",
-    "boto3",
     "chardet",
     "chromadb-client",
     "datasets",
@@ -202,6 +108,7 @@
     "fastapi",
     "fire",
     "httpx",
+    "huggingface_hub",
     "matplotlib",
     "nltk",
     "numpy",
@@ -300,34 +207,6 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
-  "cerebras": [
-    "aiosqlite",
-    "blobfile",
-    "cerebras_cloud_sdk",
-    "chardet",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
-    "matplotlib",
-    "nltk",
-    "numpy",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pypdf",
-    "redis",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch --index-url https://download.pytorch.org/whl/cpu"
-  ],
   "ollama": [
     "aiohttp",
     "aiosqlite",
@@ -361,7 +240,7 @@
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
   ],
-  "hf-endpoint": [
+  "tgi": [
     "aiohttp",
     "aiosqlite",
     "autoevals",
@@ -393,5 +272,126 @@
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "together": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "together",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "remote-vllm": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "vllm-gpu": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "vllm",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "cerebras": [
+    "aiosqlite",
+    "blobfile",
+    "cerebras_cloud_sdk",
+    "chardet",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
   ]
 }
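Each key in this JSON is a distribution template, and its value is the list of pip packages that template pulls in when it is built. As a rough, illustrative sketch (the template name is just an example key from the list above, and the build/run commands mirror the ones quoted in the distribution docs later in this commit; the exact run.yaml path is assumed from that pattern):

```bash
# Sketch only: build one of the templates listed above and start it.
# "bedrock" is an example key from the JSON; the run.yaml path follows the
# distributions/<template>/run.yaml pattern used in the docs below.
llama stack build --template bedrock --image-type conda
llama stack run distributions/bedrock/run.yaml --port 5001
```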

View file

@@ -31,9 +31,9 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu
 The following environment variables can be configured:
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
-- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `Llama-Guard-3-1B`)
 - `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-gpu --image-type conda
 llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-Llama-Guard-3-1B
 ```
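Assembled, the Docker example on this page now reads roughly as below; the `-it` and `-p` lines are assumed boilerplate from the unchanged part of the doc, and GPU passthrough flags may additionally be required for this distribution:

```bash
# Sketch of the full command after the model-name change.
# -it and -p are assumed context from the unchanged portion of the doc;
# GPU flags (e.g. --gpus all) may also be needed and are not shown in the diff.
export LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
```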

View file

@@ -33,7 +33,7 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu
 The following environment variables can be configured:
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-Llama-Guard-3-1B
 ```
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-quantized-gpu --image-type conda
 llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```

View file

@@ -53,7 +53,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -65,8 +65,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 ### Via Conda
@@ -77,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -85,6 +85,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```

View file

@@ -112,7 +112,7 @@ def get_distribution_template() -> DistributionTemplate:
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
+                "Llama3.2-3B-Instruct",
                 "Inference model loaded into the Meta Reference server",
             ),
             "INFERENCE_CHECKPOINT_DIR": (
@@ -120,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate:
                 "Directory containing the Meta Reference model checkpoint",
             ),
             "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
+                "Llama-Guard-3-1B",
                 "Name of the safety (Llama-Guard) model to use",
             ),
             "SAFETY_CHECKPOINT_DIR": (

View file

@@ -55,7 +55,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -67,8 +67,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 ### Via Conda
@@ -79,7 +79,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -87,6 +87,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```

View file

@@ -84,7 +84,7 @@ def get_distribution_template() -> DistributionTemplate:
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
+                "Llama3.2-3B-Instruct",
                 "Inference model loaded into the Meta Reference server",
             ),
             "INFERENCE_CHECKPOINT_DIR": (