From 85d0f5f5285e706ee2fb2ac3a07923bf0459beb2 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Tue, 17 Dec 2024 14:09:32 -0800 Subject: [PATCH] modify doc --- distributions/dependencies.json | 256 +++++++++--------- .../self_hosted_distro/meta-reference-gpu.md | 16 +- .../meta-reference-quantized-gpu.md | 14 +- .../meta-reference-gpu/doc_template.md | 12 +- .../meta-reference-gpu/meta_reference.py | 4 +- .../doc_template.md | 12 +- .../meta_reference.py | 2 +- 7 files changed, 158 insertions(+), 158 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 7a974b917..366a2a0f2 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,9 +1,9 @@ { - "hf-serverless": [ - "aiohttp", + "bedrock": [ "aiosqlite", "autoevals", "blobfile", + "boto3", "chardet", "chromadb-client", "datasets", @@ -11,100 +11,6 @@ "fastapi", "fire", "httpx", - "huggingface_hub", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "together": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "together", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "vllm-gpu": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "vllm", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "remote-vllm": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", "matplotlib", "nltk", "numpy", @@ -157,7 +63,7 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "tgi": [ + "hf-endpoint": [ "aiohttp", "aiosqlite", "autoevals", @@ -190,11 +96,11 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "bedrock": [ + "hf-serverless": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", - "boto3", "chardet", "chromadb-client", "datasets", @@ -202,6 +108,7 @@ "fastapi", "fire", "httpx", + "huggingface_hub", "matplotlib", "nltk", "numpy", @@ -300,34 +207,6 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "cerebras": [ - "aiosqlite", - "blobfile", - "cerebras_cloud_sdk", - "chardet", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", 
- "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], "ollama": [ "aiohttp", "aiosqlite", @@ -361,7 +240,7 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ + "tgi": [ "aiohttp", "aiosqlite", "autoevals", @@ -393,5 +272,126 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "together": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "remote-vllm": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "vllm-gpu": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "cerebras": [ + "aiosqlite", + "blobfile", + "cerebras_cloud_sdk", + "chardet", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index d46039318..f6df8b1f3 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -31,9 +31,9 @@ Note that you need access to nvidia GPUs to run this distribution. 
This distribu
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
-- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `Llama-Guard-3-1B`)
 - `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)
 
 
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-gpu --image-type conda
 llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
index 837be744a..73db33026 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
@@ -33,7 +33,7 @@ Note that you need access to nvidia GPUs to run this distribution. 
This distribu
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
-- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `Llama3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
 
 
@@ -63,7 +63,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -75,8 +75,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-meta-reference-quantized-gpu \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -87,7 +87,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template meta-reference-quantized-gpu --image-type conda
 llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -95,6 +95,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md
index 421812dbc..71653cfc1 100644
--- a/llama_stack/templates/meta-reference-gpu/doc_template.md
+++ b/llama_stack/templates/meta-reference-gpu/doc_template.md
@@ -53,7 +53,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -65,8 +65,8 @@ docker run \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
+  --env SAFETY_MODEL=Llama-Guard-3-1B
 ```
 
 ### Via Conda
@@ -77,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --env INFERENCE_MODEL=Llama3.2-3B-Instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -85,6 +85,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port 5001 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --env 
INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index 461d89a4a..0c809016c 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -112,7 +112,7 @@ def get_distribution_template() -> DistributionTemplate: "Port for the Llama Stack distribution server", ), "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", + "Llama3.2-3B-Instruct", "Inference model loaded into the Meta Reference server", ), "INFERENCE_CHECKPOINT_DIR": ( @@ -120,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate: "Directory containing the Meta Reference model checkpoint", ), "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", + "Llama-Guard-3-1B", "Name of the safety (Llama-Guard) model to use", ), "SAFETY_CHECKPOINT_DIR": ( diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index daa380d20..897a5faf7 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -55,7 +55,7 @@ docker run \ -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct + --env INFERENCE_MODEL=Llama3.2-3B-Instruct ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -67,8 +67,8 @@ docker run \ -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B + --env INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` ### Via Conda @@ -79,7 +79,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a llama stack build --template {{ name }} --image-type conda llama stack run distributions/{{ name }}/run.yaml \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct + --env INFERENCE_MODEL=Llama3.2-3B-Instruct ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -87,6 +87,6 @@ If you are using Llama Stack Safety / Shield APIs, use: ```bash llama stack run distributions/{{ name }}/run-with-safety.yaml \ --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B + --env INFERENCE_MODEL=Llama3.2-3B-Instruct \ + --env SAFETY_MODEL=Llama-Guard-3-1B ``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index c460860c5..68d84ba67 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -84,7 +84,7 @@ def get_distribution_template() -> DistributionTemplate: "Port for the Llama Stack distribution server", ), "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", + "Llama3.2-3B-Instruct", "Inference model loaded into the Meta Reference server", ), "INFERENCE_CHECKPOINT_DIR": (
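
A note on the defaults this patch switches to: `Llama3.2-3B-Instruct` and `Llama-Guard-3-1B` are native `llama` CLI model descriptors rather than Hugging Face repo IDs such as `meta-llama/Llama-3.2-3B-Instruct`, so the meta-reference provider expects matching checkpoints under `~/.llama`, which the docker examples mount into the container. The sketch below walks through that prerequisite; it assumes the `llama` CLI that ships with `llama-stack`/`llama-models`, and the download flags and checkpoint layout shown are assumptions to verify against `llama download --help`.

```bash
# Fetch the checkpoints by their native descriptors (flags are assumptions to verify;
# $META_URL stands in for the download URL Meta sends after you request model access).
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url "$META_URL"
llama download --source meta --model-id Llama-Guard-3-1B --meta-url "$META_URL"

# The values passed via --env must match the checkpoint directories on disk
# (assumed layout: ~/.llama/checkpoints/<descriptor>).
ls ~/.llama/checkpoints

# Launch with the same descriptors, mirroring the run-with-safety examples above.
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=Llama3.2-3B-Instruct \
  --env SAFETY_MODEL=Llama-Guard-3-1B
```

If a checkpoint lives elsewhere, the `INFERENCE_CHECKPOINT_DIR` / `SAFETY_CHECKPOINT_DIR` variables documented above (default `null`) point the provider at that directory instead.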