Auto-generate distro yamls + docs (#468)
# What does this PR do?

Automatically generates, for each distribution:
- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of the markdown docs

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been done, but more thorough testing is still needed.
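For orientation, the generated `build.yaml` / `run.yaml` pairs are what the `llama stack` CLI consumes. A rough usage sketch for the remote-vllm distribution touched below; the command flags and paths are assumptions about the CLI, not something this diff itself adds:

```bash
# Sketch only: flags and paths are assumptions, not part of this PR.
llama stack build --template remote-vllm --image-type docker    # consumes the generated build.yaml
llama stack run distributions/remote-vllm/run.yaml --port 5001  # starts a server from the generated run.yaml
```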
parent 0784284ab5
commit 2a31163178
88 changed files with 3008 additions and 852 deletions
```diff
@@ -1,33 +1,28 @@
-# NOTES:
-#
-# This Docker Compose (and the associated run.yaml) assumes you will be
-# running in the default "bridged" network mode.
-#
-# If you need "host" network mode, please uncomment
-# - network_mode: "host"
-#
-# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
-#
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
```
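The new healthcheck is what the llamastack service keys off of further down. A quick manual sanity check of the inference container can hit the same endpoint the healthcheck uses; a sketch assuming the default port mapping from this compose file:

```bash
# Mirrors the compose healthcheck for the vllm-inference service (default port 5100).
curl -f "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health" && echo "vLLM inference server is up"
```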
```diff
@@ -35,25 +30,34 @@ services:
       - driver: nvidia
         capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
```
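The conditional service name relies on `${VAR:+word}` parameter expansion (expands to `word` only when the variable is set and non-empty). A minimal shell illustration of the behavior the comment describes; compose performs its own substitution, but the expansion rule is the same:

```bash
# When the variable is set, the expansion yields "safety"; when unset it yields nothing,
# so the service name ends in a hyphen and the entry is effectively skipped.
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"   # -> vllm-safety:
unset VLLM_SAFETY_MODEL
echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"   # -> vllm-:
```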
```diff
@@ -63,23 +67,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
-    # image: llamastack/distribution-remote-vllm
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       - MAX_TOKENS=${MAX_TOKENS:-4096}
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
```
```diff
@@ -89,6 +95,6 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
```
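Putting the pieces together, every knob introduced above is a plain environment variable with a default. A sketch of a typical invocation, assuming the compose file is launched from its own directory so the relative `./run*.yaml` mount resolves; variable names are the ones defined in the diff, values are illustrative:

```bash
# Optional overrides; each variable falls back to the default baked into the compose file.
export VLLM_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_INFERENCE_PORT=5100
export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B   # leave unset to skip the safety service
export VLLM_SAFETY_PORT=5101
export LLAMASTACK_PORT=5001
export HF_TOKEN=<your-hf-token>                        # needed to pull gated Llama weights
docker compose up
```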
distributions/remote-vllm/run-with-safety.yaml (symbolic link, 1 addition)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run-with-safety.yaml
```
```diff
@@ -1,68 +0,0 @@
-version: '2'
-built_at: '2024-11-11T20:09:45.988375'
-image_name: remote-vllm
-docker_image: remote-vllm
-conda_env: null
-apis:
-- inference
-- memory
-- safety
-- agents
-- telemetry
-providers:
-  inference:
-  # serves main inference model
-  - provider_id: vllm-0
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_INFERENCE_VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  memory:
-  - provider_id: faiss-0
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/agents_store.db"
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-metadata_store:
-  namespace: null
-  type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/registry.db"
-models:
-- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
```
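The removed file used llama-stack's `${env.VAR:default}` substitution (for example `${env.MAX_TOKENS:4096}`), so the values above were resolved from the server process environment at startup. A sketch of what overriding one of them looks like, reusing the server entrypoint shown in the compose file; the config path and values here are illustrative:

```bash
# Illustrative only: override MAX_TOKENS before starting the stack server so that
# ${env.MAX_TOKENS:4096} in run.yaml resolves to 2048 instead of the default.
MAX_TOKENS=2048 SQLITE_STORE_DIR=$HOME/.llama/distributions/remote-vllm \
  python -m llama_stack.distribution.server.server \
    --yaml_config ./run.yaml --port 5001
```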
distributions/remote-vllm/run.yaml (symbolic link, 1 addition)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run.yaml
```
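Both distribution-level YAMLs are now thin symlinks into the auto-generated templates. A quick way to confirm where they point, assuming a checkout of the repository root:

```bash
# Each distribution-level YAML should resolve to its generated template.
readlink distributions/remote-vllm/run.yaml               # ../../llama_stack/templates/remote-vllm/run.yaml
readlink distributions/remote-vllm/run-with-safety.yaml   # ../../llama_stack/templates/remote-vllm/run-with-safety.yaml
```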