diff --git a/distributions/meta-reference-gpu/run-with-safety.yaml b/distributions/meta-reference-gpu/run-with-safety.yaml
new file mode 100644
index 000000000..7d01159df
--- /dev/null
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@@ -0,0 +1,70 @@
+version: '2'
+image_name: meta-reference-gpu
+docker_image: null
+conda_env: null
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: meta-reference-inference
+    provider_type: inline::meta-reference
+    config:
+      model: ${env.INFERENCE_MODEL}
+      max_seq_len: 4096
+      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: meta-reference-safety
+    provider_type: inline::meta-reference
+    config:
+      model: ${env.SAFETY_MODEL}
+      max_seq_len: 4096
+      checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:null}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: meta-reference-inference
+  provider_model_id: null
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL}
+  provider_id: meta-reference-safety
+  provider_model_id: null
+shields:
+- params: null
+  shield_id: ${env.SAFETY_MODEL}
+  provider_id: null
+  provider_shield_id: null
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
index 3645cb75b..c67ba60cd 100644
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@@ -1,68 +1,56 @@
 version: '2'
-image_name: local
+image_name: meta-reference-gpu
 docker_image: null
-conda_env: local
+conda_env: null
 apis:
-- shields
 - agents
-- models
-- memory
-- memory_banks
 - inference
+- memory
 - safety
+- telemetry
 providers:
   inference:
-  - provider_id: inference0
+  - provider_id: meta-reference-inference
     provider_type: inline::meta-reference
     config:
-      model: Llama3.2-3B-Instruct
-      quantization: null
-      torch_seed: null
+      model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
-      max_batch_size: 1
-  - provider_id: inference1
-    provider_type: inline::meta-reference
-    config:
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-# Uncomment to use prompt guard
-# prompt_guard_shield:
-#   model: Prompt-Guard-86M
+      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
   memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: meta-reference-inference
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md
index 1d5842c07..a0add3858 100644
--- a/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -1,15 +1,23 @@
 # Meta Reference Distribution
 
-The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations.
+The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations:
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `inline::meta-reference` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `inline::llama-guard` |
+| telemetry | `inline::meta-reference` |
 
-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
-| **Provider(s)** | meta-reference | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
+Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
 
-### Step 0. Prerequisite - Downloading Models
-Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models.
+
+## Prerequisite: Downloading Models
+
+Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
 
 ```
 $ ls ~/.llama/checkpoints
@@ -17,55 +25,56 @@ Llama3.1-8B  Llama3.2-11B-Vision-Instruct  Llama3.2-1B-Instruct  Llama3
 Llama3.1-8B-Instruct  Llama3.2-1B  Llama3.2-3B-Instruct  Llama-Guard-3-1B  Prompt-Guard-86M
 ```
 
-### Step 1. Start the Distribution
+## Running the Distribution
 
-#### (Option 1) Start with Docker
-```
-$ cd distributions/meta-reference-gpu && docker compose up
+You can run the distribution via Conda (building the code yourself) or via Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-meta-reference-gpu \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```
 
-> [!NOTE]
-> This assumes you have access to GPU to start a local server with access to your GPU.
+If you are using Llama Stack Safety / Shield APIs, use:
 
-
-> [!NOTE]
-> `~/.llama` should be the path containing downloaded weights of Llama models.
-
-
-This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
-
-```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
+```bash
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  llamastack/distribution-meta-reference-gpu \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```
 
-#### (Option 2) Start with Conda
+### Via Conda
 
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
-2. Build the `meta-reference-gpu` distribution
-
-```
-$ llama stack build --template meta-reference-gpu --image-type conda
+```bash
+llama stack build --template meta-reference-gpu --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
 
-3. Start running distribution
-```
-$ cd distributions/meta-reference-gpu
-$ llama stack run ./run.yaml
-```
+If you are using Llama Stack Safety / Shield APIs, use:
 
-### (Optional) Serving a new model
-You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`.
+```bash
+llama stack run ./run-with-safety.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
-inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      model: Llama3.2-11B-Vision-Instruct
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-```
-
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
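A note on the `${env.VAR}` / `${env.VAR:default}` placeholders used throughout the run configs above: when no default follows the `:`, the variable must be supplied (for example via `--env INFERENCE_MODEL=...`), otherwise the literal default is used. As a rough illustration of the substitution pattern only — the resolver below is a hypothetical sketch, not llama-stack's actual implementation — consider:

```python
import os
import re

# Hypothetical resolver for the ${env.VAR} / ${env.VAR:default} placeholders used
# in the run configs above; llama-stack's real substitution logic may differ.
_ENV_PATTERN = re.compile(r"\$\{env\.(?P<name>[A-Za-z0-9_]+)(?::(?P<default>[^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    def _sub(match: re.Match) -> str:
        name, default = match.group("name"), match.group("default")
        resolved = os.environ.get(name, default)
        if resolved is None:
            raise ValueError(f"{name} is not set and has no default")
        return resolved

    return _ENV_PATTERN.sub(_sub, value)


# With SQLITE_STORE_DIR unset, the default after the ':' is used:
print(resolve_env_placeholders(
    "${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db"
))
```

Note how `checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}` pairs with the `config.checkpoint_dir != "null"` guard added in `generation.py` below: when the variable is unset, the literal string `"null"` is substituted and the code falls back to `model_checkpoint_dir(model)`.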
diff --git a/llama_stack/providers/inline/inference/meta_reference/config.py b/llama_stack/providers/inline/inference/meta_reference/config.py
index 48cba645b..11648b117 100644
--- a/llama_stack/providers/inline/inference/meta_reference/config.py
+++ b/llama_stack/providers/inline/inference/meta_reference/config.py
@@ -49,6 +49,18 @@ class MetaReferenceInferenceConfig(BaseModel):
         resolved = resolve_model(self.model)
         return resolved.pth_file_count
 
+    @classmethod
+    def sample_run_config(
+        cls,
+        model: str = "Llama3.2-3B-Instruct",
+        checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
+    ) -> Dict[str, Any]:
+        return {
+            "model": model,
+            "max_seq_len": 4096,
+            "checkpoint_dir": checkpoint_dir,
+        }
+
 
 class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
     quantization: QuantizationConfig
diff --git a/llama_stack/providers/inline/inference/meta_reference/generation.py b/llama_stack/providers/inline/inference/meta_reference/generation.py
index 38c982473..577f5184b 100644
--- a/llama_stack/providers/inline/inference/meta_reference/generation.py
+++ b/llama_stack/providers/inline/inference/meta_reference/generation.py
@@ -107,7 +107,7 @@ class Llama:
             sys.stdout = open(os.devnull, "w")
 
         start_time = time.time()
-        if config.checkpoint_dir:
+        if config.checkpoint_dir and config.checkpoint_dir != "null":
             ckpt_dir = config.checkpoint_dir
         else:
             ckpt_dir = model_checkpoint_dir(model)
@@ -137,7 +137,6 @@ class Llama:
         ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
 
         if isinstance(config, MetaReferenceQuantizedInferenceConfig):
-
             if isinstance(config.quantization, Fp8QuantizationConfig):
                 from .quantization.loader import convert_to_fp8_quantized_model
 
diff --git a/llama_stack/templates/meta-reference-gpu/__init__.py b/llama_stack/templates/meta-reference-gpu/__init__.py
new file mode 100644
index 000000000..1cfdb2c6a
--- /dev/null
+++ b/llama_stack/templates/meta-reference-gpu/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .meta_reference import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml
index 7c468e41c..ef075d098 100644
--- a/llama_stack/templates/meta-reference-gpu/build.yaml
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@@ -1,13 +1,19 @@
+version: '2'
 name: meta-reference-gpu
 distribution_spec:
-  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
-  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  description: Use Meta Reference for running LLM inference
+  docker_image: null
   providers:
-    inference: meta-reference
+    inference:
+    - inline::meta-reference
     memory:
     - inline::faiss
     - remote::chromadb
     - remote::pgvector
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md
new file mode 100644
index 000000000..9a61ff691
--- /dev/null
+++ b/llama_stack/templates/meta-reference-gpu/doc_template.md
@@ -0,0 +1,82 @@
+# Meta Reference Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+{{ providers_table }}
+
+Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Prerequisite: Downloading Models
+
+Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
+
+```
+$ ls ~/.llama/checkpoints
+Llama3.1-8B           Llama3.2-11B-Vision-Instruct  Llama3.2-1B-Instruct  Llama3.2-90B-Vision-Instruct  Llama-Guard-3-8B
+Llama3.1-8B-Instruct  Llama3.2-1B                   Llama3.2-3B-Instruct  Llama-Guard-3-1B              Prompt-Guard-86M
+```
+
+## Running the Distribution
+
+You can run the distribution via Conda (building the code yourself) or via Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+```
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack build --template meta-reference-gpu --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+llama stack run ./run-with-safety.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+```
diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py
new file mode 100644
index 000000000..04bf889c2
--- /dev/null
+++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.meta_reference import (
+    MetaReferenceInferenceConfig,
+)
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["inline::meta-reference"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="meta-reference-inference",
+        provider_type="inline::meta-reference",
+        config=MetaReferenceInferenceConfig.sample_run_config(
+            model="${env.INFERENCE_MODEL}",
+            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
+        ),
+    )
+
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="meta-reference-inference",
+    )
+    safety_model = ModelInput(
+        model_id="${env.SAFETY_MODEL}",
+        provider_id="meta-reference-safety",
+    )
+
+    return DistributionTemplate(
+        name="meta-reference-gpu",
+        distro_type="self_hosted",
+        description="Use Meta Reference for running LLM inference",
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=[inference_model, safety_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+            "run-with-safety.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [
+                        inference_provider,
+                        Provider(
+                            provider_id="meta-reference-safety",
+                            provider_type="inline::meta-reference",
+                            config=MetaReferenceInferenceConfig.sample_run_config(
+                                model="${env.SAFETY_MODEL}",
+                                checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}",
+                            ),
+                        ),
+                    ],
+                },
+                default_models=[
+                    inference_model,
+                    safety_model,
+                ],
+                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+            ),
+        },
+        docker_compose_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the Meta Reference server",
+            ),
+            "INFERENCE_CHECKPOINT_DIR": (
+                "null",
+                "Directory containing the Meta Reference model checkpoint",
+            ),
+            "SAFETY_MODEL": (
+                "meta-llama/Llama-Guard-3-1B",
+                "Name of the safety (Llama-Guard) model to use",
+            ),
+            "SAFETY_CHECKPOINT_DIR": (
+                "null",
+                "Directory containing the Llama-Guard model checkpoint",
+            ),
+        },
+    )
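To sanity-check the new `MetaReferenceInferenceConfig.sample_run_config` helper introduced in `config.py`, a quick check like the following can be used — a minimal sketch, assuming this branch is installed locally and that the class is re-exported from the provider package as `meta_reference.py` above implies:

```python
from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceInferenceConfig,
)

# This should mirror the inference provider `config:` block written into run.yaml.
cfg = MetaReferenceInferenceConfig.sample_run_config(
    model="${env.INFERENCE_MODEL}",
    checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
)
print(cfg)
# {'model': '${env.INFERENCE_MODEL}', 'max_seq_len': 4096,
#  'checkpoint_dir': '${env.INFERENCE_CHECKPOINT_DIR:null}'}
```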
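Similarly, the new template module can be loaded and inspected. The package directory name contains a hyphen, so a plain `import` statement will not work; the sketch below uses `importlib` and assumes `DistributionTemplate` exposes its constructor arguments (`name`, `providers`, `run_configs`) as attributes:

```python
import importlib

# Load the hyphenated template package and build the distribution template.
template_module = importlib.import_module("llama_stack.templates.meta-reference-gpu")
template = template_module.get_distribution_template()

print(template.name)                    # meta-reference-gpu
print(list(template.run_configs))       # ['run.yaml', 'run-with-safety.yaml']
print(template.providers["inference"])  # ['inline::meta-reference']
```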