From ae671eaf7a04656cf95465f0927b9cc82bcb4bf3 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 28 Oct 2024 17:47:14 -0700
Subject: [PATCH] distro readmes with model serving instructions (#339)

* readme updates
* quantied compose
* dell tgi
* config update
* readme
* update model serving readmes
* update
* update
* config
---
 distributions/fireworks/README.md          | 26 +++++++++++++++++++++++++-
 distributions/fireworks/run.yaml           |  5 +++++
 distributions/meta-reference-gpu/README.md | 16 ++++++++++++++++
 distributions/meta-reference-gpu/run.yaml  |  9 +++++++++
 distributions/ollama/README.md             | 25 +++++++++++++++++++++++++
 distributions/tgi/README.md                | 23 +++++++++++++++++++++++
 distributions/together/README.md           | 23 +++++++++++++++++++++++
 distributions/together/run.yaml            | 13 ++++++++++---
 8 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md
index b5bdd9c17..a753de429 100644
--- a/distributions/fireworks/README.md
+++ b/distributions/fireworks/README.md
@@ -43,7 +43,7 @@ inference:
   provider_type: remote::fireworks
   config:
     url: https://api.fireworks.ai/inference
-    api_key:
+    api_key:
 ```
 **Via Conda**
 ```
 llama stack build --template fireworks --image-type conda
 # -- modify run.yaml to a valid Fireworks server endpoint
 llama stack run ./run.yaml
 ```
+
+### Model Serving
+
+Use `llama-stack-client models list` to check the available models served by Fireworks.
+```
+$ llama-stack-client models list
++------------------------------+------------------------------+---------------+------------+
+| identifier                   | llama_model                  | provider_id   | metadata   |
++==============================+==============================+===============+============+
+| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-1B-Instruct         | Llama3.2-1B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0    | {}         |
++------------------------------+------------------------------+---------------+------------+
+```
diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml
index c48b0cb7b..4363d86f3 100644
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@@ -17,6 +17,7 @@ providers:
     provider_type: remote::fireworks
     config:
       url: https://api.fireworks.ai/inference
+      # api_key:
   safety:
   - provider_id: meta0
     provider_type: meta-reference
@@ -32,6 +33,10 @@ providers:
   - provider_id: meta0
     provider_type: meta-reference
     config: {}
+    # Uncomment to use weaviate memory provider
+    # - provider_id: weaviate0
+    #   provider_type: remote::weaviate
+    #   config: {}
   agents:
   - provider_id: meta0
     provider_type: meta-reference
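For scripting the same check that `llama-stack-client models list` performs against a Fireworks-backed stack, a minimal Python sketch is shown below. It assumes the `llama_stack_client` package is installed and a stack server is listening on `http://localhost:5000`; the `models.list()` call mirrors the CLI command and the exact method name may differ between SDK versions.

```python
# Minimal sketch: list the models a running Llama Stack server exposes.
# Assumptions: `pip install llama-stack-client`, server on localhost:5000,
# and an SDK `models.list()` method mirroring the CLI command above.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

for model in client.models.list():
    # Each entry should match a row of the table above,
    # e.g. Llama3.1-8B-Instruct served by the fireworks0 provider.
    print(model)
```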
diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md
index 677ea71e5..d4c49aff7 100644
--- a/distributions/meta-reference-gpu/README.md
+++ b/distributions/meta-reference-gpu/README.md
@@ -84,3 +84,19 @@ memory:
 ```

 3. Run `docker compose up` with the updated `run.yaml` file.
+
+### Serving a new model
+You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama` directory.
+```
+inference:
+ - provider_id: meta0
+   provider_type: meta-reference
+   config:
+     model: Llama3.2-11B-Vision-Instruct
+     quantization: null
+     torch_seed: null
+     max_seq_len: 4096
+     max_batch_size: 1
+```
+
+Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
index 724ca030a..9bf7655f9 100644
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@@ -36,6 +36,15 @@ providers:
   - provider_id: meta0
     provider_type: meta-reference
     config: {}
+    # Uncomment to use pgvector
+    # - provider_id: pgvector
+    #   provider_type: remote::pgvector
+    #   config:
+    #     host: 127.0.0.1
+    #     port: 5432
+    #     db: postgres
+    #     user: postgres
+    #     password: mysecretpassword
   agents:
   - provider_id: meta0
     provider_type: meta-reference
diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md
index ad311c4a6..0d2ce6973 100644
--- a/distributions/ollama/README.md
+++ b/distributions/ollama/README.md
@@ -89,3 +89,28 @@ inference:
 llama stack build --template ollama --image-type conda
 llama stack run ./gpu/run.yaml
 ```
+
+### Model Serving
+
+To serve a new model with `ollama`:
+```
+ollama run <model_name>
+```
+
+To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
+```
+$ ollama ps
+
+NAME                         ID              SIZE     PROCESSOR    UNTIL
+llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
+```
+
+To verify that the model served by ollama is correctly connected to the Llama Stack server, run:
+```
+$ llama-stack-client models list
++----------------------+----------------------+---------------+-----------------------------------------------+
+| identifier           | llama_model          | provider_id   | metadata                                      |
++======================+======================+===============+===============================================+
+| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
++----------------------+----------------------+---------------+-----------------------------------------------+
+```
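Once `llama-stack-client models list` shows the ollama-backed model, a quick end-to-end check is to push one chat completion through the stack. The sketch below assumes the `llama_stack_client` Python package and a server on `http://localhost:5000`; the `inference.chat_completion()` call and the message format are assumptions that may vary across SDK versions.

```python
# Minimal sketch: one round-trip through the stack to confirm the
# ollama-served model answers. Method names and message format are
# assumptions based on the llama-stack-client SDK; adjust to your version.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    model="Llama3.1-8B-Instruct",  # identifier reported by `models list`
    messages=[{"role": "user", "content": "Reply with a one-line greeting."}],
)
print(response)
```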
diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md
index 0ea6eec5d..f274f8ff0 100644
--- a/distributions/tgi/README.md
+++ b/distributions/tgi/README.md
@@ -92,3 +92,26 @@ llama stack build --template tgi --image-type conda
 # -- start a TGI server endpoint
 llama stack run ./gpu/run.yaml
 ```
+
+### Model Serving
+To serve a new model with `tgi`, change the docker command flag `--model-id <model_id>`.
+
+This can be done by editing the `command` args in `compose.yaml`, e.g. replacing `Llama-3.2-1B-Instruct` with the model you want to serve.
+
+```
+command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+```
+
+or by changing the docker run command's `--model-id` flag:
+```
+docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009
+```
+
+In `run.yaml`, make sure the inference provider's `url` points to the TGI server endpoint serving your model.
+```
+inference:
+  - provider_id: tgi0
+    provider_type: remote::tgi
+    config:
+      url: http://127.0.0.1:5009
+```
diff --git a/distributions/together/README.md b/distributions/together/README.md
index 1234cab7e..378b7c0c7 100644
--- a/distributions/together/README.md
+++ b/distributions/together/README.md
@@ -56,3 +56,26 @@ llama stack build --template together --image-type conda
 # -- modify run.yaml to a valid Together server endpoint
 llama stack run ./run.yaml
 ```
+
+### Model Serving
+
+Use `llama-stack-client models list` to check the available models served by Together.
+
+```
+$ llama-stack-client models list
++------------------------------+------------------------------+---------------+------------+
+| identifier                   | llama_model                  | provider_id   | metadata   |
++==============================+==============================+===============+============+
+| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0     | {}         |
++------------------------------+------------------------------+---------------+------------+
+```
diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml
index a08520464..87fd4dcd7 100644
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@@ -17,11 +17,18 @@ providers:
     provider_type: remote::together
     config:
       url: https://api.together.xyz/v1
+      # api_key:
   safety:
-  - provider_id: together0
-    provider_type: remote::together
+  - provider_id: meta0
+    provider_type: meta-reference
     config:
-      url: https://api.together.xyz/v1
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
   - provider_id: meta0
     provider_type: remote::weaviate
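Before running `llama stack run`, it can save a round of debugging to confirm that the endpoint configured under the inference provider is actually reachable, for example the TGI server from the section above. The sketch below reads `run.yaml` and probes a `/health` route; it assumes PyYAML is installed, that the full `run.yaml` nests the endpoint as `providers.inference[0].config.url`, and that your text-generation-inference build exposes `/health`.

```python
# Minimal sketch: check that the inference endpoint in run.yaml answers
# before starting the stack. Assumptions: PyYAML installed, run.yaml nests
# the URL under providers.inference[0].config.url, and the TGI build in use
# exposes a /health route; adjust the path and route for your setup.
import urllib.request

import yaml

with open("run.yaml") as f:
    config = yaml.safe_load(f)

url = config["providers"]["inference"][0]["config"]["url"]
print(f"inference provider url: {url}")

with urllib.request.urlopen(f"{url}/health") as resp:
    print(f"endpoint responded with HTTP {resp.status}")
```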