Auto-generate distro yamls + docs (#468)

# What does this PR do?

Automatically generates
- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of markdown docs

for the distributions.

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing
(especially with ollama and vllm) has been performed but needs to be
much more tested.
This commit is contained in:
Ashwin Bharambe 2024-11-18 14:57:06 -08:00 committed by GitHub
parent 0784284ab5
commit 2a31163178
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
88 changed files with 3008 additions and 852 deletions

View file

@ -1,5 +1,4 @@
version: '2'
built_at: '2024-11-01T17:40:45.325529'
image_name: local
name: bedrock
docker_image: null

View file

@ -1,5 +1,4 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local

View file

@ -1,51 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: fireworks0
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference
# api_key: <ENTER_YOUR_API_KEY>
safety:
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
# Uncomment to use weaviate memory provider
# - provider_id: weaviate0
# provider_type: remote::weaviate
# config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/run.yaml

View file

@ -1,5 +1,4 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

View file

@ -1,69 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: inference0
provider_type: inline::meta-reference
config:
model: Llama3.2-3B-Instruct
quantization: null
torch_seed: null
max_seq_len: 4096
max_batch_size: 1
- provider_id: inference1
provider_type: inline::meta-reference
config:
model: Llama-Guard-3-1B
quantization: null
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
# Uncomment to use prompt guard
# prompt_guard_shield:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
# Uncomment to use pgvector
# - provider_id: pgvector
# provider_type: remote::pgvector
# config:
# host: 127.0.0.1
# port: 5432
# db: postgres
# user: postgres
# password: mysecretpassword
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

View file

@ -1,5 +1,4 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local

View file

@ -1,5 +1,4 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
@ -13,20 +12,15 @@ apis:
- safety
providers:
inference:
- provider_id: ollama0
- provider_id: ollama
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
@ -43,3 +37,10 @@ providers:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}

View file

@ -1,30 +1,71 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: "host"
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
- ~/.ollama:/root/.ollama
ports:
- "11434:11434"
environment:
OLLAMA_DEBUG: 1
command: []
deploy:
resources:
limits:
memory: 8G # Set maximum memory
reservations:
memory: 8G # Set minimum memory reservation
# healthcheck:
# # ugh, no CURL in ollama image
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
# interval: 10s
# timeout: 5s
# retries: 5
ollama-init:
image: ollama/ollama:latest
depends_on:
- ollama
# condition: service_healthy
network_mode: ${NETWORK_MODE:-bridge}
environment:
- OLLAMA_HOST=ollama
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
volumes:
- ~/.ollama:/root/.ollama
- ./pull-models.sh:/pull-models.sh
entrypoint: ["/pull-models.sh"]
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
network_mode: "host"
ollama:
condition: service_started
ollama-init:
condition: service_started
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/my-run.yaml
- ~/local/llama-stack/:/app/llama-stack-source
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
- OLLAMA_URL=http://ollama:11434
entrypoint: >
python -m llama_stack.distribution.server.server /root/my-run.yaml \
--port ${LLAMA_STACK_PORT:-5001}
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
delay: 10s
max_attempts: 3
window: 60s
volumes:
ollama:
ollama-init:
llamastack:

View file

@ -0,0 +1,18 @@
#!/bin/sh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
echo "Preloading $model..."
if ! ollama run "$model"; then
echo "Failed to pull and run $model"
exit 1
fi
done
echo "All models pulled successfully"

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

View file

@ -1,45 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run.yaml

View file

@ -1,33 +1,28 @@
# NOTES:
#
# This Docker Compose (and the associated run.yaml) assumes you will be
# running in the default "bridged" network mode.
#
# If you need "host" network mode, please uncomment
# - network_mode: "host"
#
# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
#
services:
vllm-0:
vllm-inference:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
# network_mode: "host"
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "5100:5100"
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model meta-llama/Llama-3.1-8B-Instruct
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port 5100
--port ${VLLM_INFERENCE_PORT:-5100}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
@ -35,25 +30,34 @@ services:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
vllm-1:
# A little trick:
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
vllm-${VLLM_SAFETY_MODEL:+safety}:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
# network_mode: "host"
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "5101:5101"
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=1
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model meta-llama/Llama-Guard-3-1B
--model ${VLLM_SAFETY_MODEL}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port 5101
--port ${VLLM_SAFETY_PORT:-5101}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
@ -63,23 +67,25 @@ services:
runtime: nvidia
llamastack:
depends_on:
- vllm-0
- vllm-1
# image: llamastack/distribution-remote-vllm
- vllm-inference:
condition: service_healthy
- vllm-${VLLM_SAFETY_MODEL:+safety}:
condition: service_healthy
# image: llamastack/distribution-remote-vllm
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
volumes:
- ~/.llama:/root/.llama
- ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
# network_mode: "host"
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
network_mode: ${NETWORK_MODE:-bridged}
environment:
- LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
- LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- MAX_TOKENS=${MAX_TOKENS:-4096}
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
- LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
- LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
ports:
- "5001:5001"
- "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
# Hack: wait for vLLM server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
deploy:
@ -89,6 +95,6 @@ services:
max_attempts: 5
window: 60s
volumes:
vllm-0:
vllm-1:
vllm-inference:
vllm-safety:
llamastack:

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

View file

@ -1,68 +0,0 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
image_name: remote-vllm
docker_image: remote-vllm
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
providers:
inference:
# serves main inference model
- provider_id: vllm-0
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.LLAMA_INFERENCE_VLLM_URL:http://host.docker.internal:5100/v1}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
# serves safety llama_guard model
- provider_id: vllm-1
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.LLAMA_SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
memory:
- provider_id: faiss-0
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/faiss_store.db"
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/agents_store.db"
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/registry.db"
models:
- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}
provider_id: vllm-0
- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: vllm-1
shields:
- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run.yaml

View file

@ -1,51 +1,89 @@
services:
text-generation-inference:
tgi-inference:
image: ghcr.io/huggingface/text-generation-inference:latest
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "5009:5009"
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_INFERENCE_PORT:-8080}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
tgi-${TGI_SAFETY_MODEL:+safety}:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
--port ${TGI_SAFETY_PORT:-8081}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
text-generation-inference:
tgi-inference:
condition: service_healthy
image: llamastack/distribution-tgi
network_mode: "host"
tgi-${TGI_SAFETY_MODEL:+safety}:
condition: service_healthy
image: llamastack/distribution-tgi:test-0.0.52rc3
network_mode: ${NETWORK_MODE:-bridged}
volumes:
- ~/.llama:/root/.llama
# Link to TGI run.yaml file
- ./run.yaml:/root/my-run.yaml
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "5000:5000"
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
@ -53,3 +91,13 @@ services:
delay: 3s
max_attempts: 5
window: 60s
environment:
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
volumes:
tgi-inference:
tgi-safety:
llamastack:

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

View file

@ -1,45 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

1
distributions/tgi/run.yaml Symbolic link
View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/run.yaml

View file

@ -1,46 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: together0
provider_type: remote::together
config:
url: https://api.together.xyz/v1
# api_key: <ENTER_YOUR_API_KEY>
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: remote::weaviate
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/together/run.yaml