Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-24 05:14:30 +00:00)
Auto-generate distro yamls + docs (#468)
# What does this PR do?

Automatically generates:

- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of the markdown docs for the distributions

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been performed, but much more thorough testing is still needed.
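To make the "auto-generate" step concrete, here is a minimal, purely illustrative sketch of template-driven YAML generation. It is not the generator added by this PR: the function name `render_run_yaml` and the hard-coded fields are invented for the example, loosely modeled on the run.yaml contents visible later in this diff, and it assumes PyYAML is available.

```python
# Hypothetical sketch of template-driven config generation (not the PR's actual code).
# Field names mirror the run.yaml shown further down in this diff; assumes PyYAML.
import yaml

def render_run_yaml(tgi_url: str, out_path: str) -> None:
    """Render a minimal run config for a TGI-backed distribution and write it to disk."""
    config = {
        "version": "2",
        "image_name": "local",
        "apis": ["inference", "safety", "agents", "memory", "telemetry"],
        "providers": {
            "inference": [
                {"provider_id": "tgi0", "provider_type": "remote::tgi", "config": {"url": tgi_url}}
            ],
        },
    }
    with open(out_path, "w") as f:
        yaml.safe_dump(config, f, sort_keys=False)

if __name__ == "__main__":
    render_run_yaml("http://127.0.0.1:8080", "run.yaml")
```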
Parent: 0784284ab5
Commit: 2a31163178
88 changed files with 3008 additions and 852 deletions
```diff
@@ -1,51 +1,89 @@
 services:
-  text-generation-inference:
+  tgi-inference:
     image: ghcr.io/huggingface/text-generation-inference:latest
-    network_mode: "host"
     volumes:
       - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5009:5009"
+      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
       - HF_HOME=/data
       - HF_DATASETS_CACHE=/data
       - HF_MODULES_CACHE=/data
       - HF_HUB_CACHE=/data
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_INFERENCE_PORT:-8080}
+      --cuda-memory-fraction 0.75
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
+      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
       interval: 5s
       timeout: 5s
       retries: 30
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
               capabilities: [gpu]
     runtime: nvidia
+
+  tgi-${TGI_SAFETY_MODEL:+safety}:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridged}
+    ports:
+      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
+      - HF_TOKEN=$HF_TOKEN
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+      --port ${TGI_SAFETY_PORT:-8081}
+      --cuda-memory-fraction 0.75
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia

   llamastack:
     depends_on:
-      text-generation-inference:
+      tgi-inference:
         condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
+      tgi-${TGI_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    image: llamastack/distribution-tgi:test-0.0.52rc3
+    network_mode: ${NETWORK_MODE:-bridged}
     volumes:
       - ~/.llama:/root/.llama
       # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
+      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
     # Hack: wait for TGI server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
     restart_policy:
@@ -53,3 +91,13 @@ services:
       delay: 3s
       max_attempts: 5
       window: 60s
+    environment:
+      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
+      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+
+volumes:
+  tgi-inference:
+  tgi-safety:
+  llamastack:
```
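The new compose file leans heavily on shell-style variable substitution: `${VAR:-default}` falls back to a default when the variable is unset or empty, and `${VAR:+value}` expands to the given value only when the variable is set. The Python sketch below (not part of the PR) mimics those two forms, so it is easy to predict how names like `tgi-${TGI_SAFETY_MODEL:+safety}` and ports like `${TGI_INFERENCE_PORT:-8080}` resolve.

```python
# Illustrative sketch (not part of this PR): mimics the two shell-style substitution
# forms used in the compose file above.
import os

def default_if_unset(name: str, default: str) -> str:
    """${NAME:-default}: use the default when the variable is unset or empty."""
    value = os.environ.get(name, "")
    return value if value else default

def alt_if_set(name: str, alt: str) -> str:
    """${NAME:+alt}: expand to alt only when the variable is set and non-empty."""
    value = os.environ.get(name, "")
    return alt if value else ""

# With no environment configured, the defaults from the diff apply:
print(default_if_unset("TGI_INFERENCE_PORT", "8080"))            # -> 8080
print("tgi-" + alt_if_set("TGI_SAFETY_MODEL", "safety"))         # -> "tgi-" (no safety suffix)

# Setting TGI_SAFETY_MODEL switches on the safety suffix and the with-safety run config:
os.environ["TGI_SAFETY_MODEL"] = "meta-llama/Llama-Guard-3-1B"
print("tgi-" + alt_if_set("TGI_SAFETY_MODEL", "safety"))         # -> "tgi-safety"
print("./run" + alt_if_set("TGI_SAFETY_MODEL", "-with-safety") + ".yaml")  # -> ./run-with-safety.yaml
```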
distributions/tgi/run-with-safety.yaml (new symbolic link, 1 line)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run-with-safety.yaml
```
Removed: the previous hand-maintained TGI run configuration (45 lines), replaced by the run.yaml symbolic link below.

```diff
@@ -1,45 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
```
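The removed file above shows the provider layout that the generated templates now produce: a `remote::tgi` inference provider, `inline::llama-guard` and `inline::prompt-guard` safety providers, and meta-reference memory, agents, and telemetry providers. As a quick way to inspect any config of this shape, here is a small sketch; it assumes PyYAML, and the `run.yaml` path is just an example.

```python
# Small inspection sketch (assumes PyYAML): prints each provider declared in a run config.
import yaml

with open("run.yaml") as f:   # path is an example; point it at any generated run config
    cfg = yaml.safe_load(f)

for api, providers in cfg.get("providers", {}).items():
    for p in providers:
        print(f"{api}: {p['provider_id']} -> {p['provider_type']}")
# For the file removed above, this would print lines such as:
#   inference: tgi0 -> remote::tgi
#   safety: meta0 -> inline::llama-guard
```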
distributions/tgi/run.yaml (new symbolic link, 1 line)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run.yaml
```
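Both distributions/tgi/run.yaml and distributions/tgi/run-with-safety.yaml are now thin symbolic links into llama_stack/templates/tgi/, so the distribution folder always tracks the generated templates. The sketch below only illustrates how such links can be recreated and checked from the repository root; it is not a script shipped by this PR.

```python
# Illustrative sketch (not shipped by this PR): recreate and verify the symlinks
# added here, assuming it is run from the repository root.
import os
from pathlib import Path

links = {
    Path("distributions/tgi/run.yaml"): "../../llama_stack/templates/tgi/run.yaml",
    Path("distributions/tgi/run-with-safety.yaml"): "../../llama_stack/templates/tgi/run-with-safety.yaml",
}

for link, target in links.items():
    if not link.is_symlink() and not link.exists():
        link.symlink_to(target)  # the relative target is resolved against the link's directory
    state = os.readlink(link) if link.is_symlink() else "(regular file)"
    print(f"{link} -> {state}")
```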