Update vllm compose and run YAMLs

Ashwin Bharambe 2024-11-12 12:46:32 -08:00
parent afe4a53ae8
commit 1245a625ce
2 changed files with 107 additions and 42 deletions

compose.yaml

@@ -1,43 +1,83 @@
+# NOTES:
+#
+# This Docker Compose (and the associated run.yaml) assumes you will be
+# running in the default "bridged" network mode.
+#
+# If you need "host" network mode, please uncomment
+#  - network_mode: "host"
+# and comment the lines with port mapping
+#  - ports:
+#      - "5100:5100"
+#
+# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
+#
 services:
-  vllm:
+  vllm-0:
     image: vllm/vllm-openai:latest
-    network_mode: "host"
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # network_mode: "host"
     ports:
-      - "8000:8000"
+      - "5100:5100"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
+      - CUDA_VISIBLE_DEVICES=4
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model meta-llama/Llama-3.1-8B-Instruct
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port 5100
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+  vllm-1:
+    image: vllm/vllm-openai:latest
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # network_mode: "host"
+    ports:
+      - "5101:5101"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=5
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model meta-llama/Llama-Guard-3-1B
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port 5101
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
               capabilities: [gpu]
     runtime: nvidia
   llamastack:
     depends_on:
-    - vllm
-    image: llamastack/distribution-remote-vllm
-    network_mode: "host"
+    - vllm-0
+    - vllm-1
+    # image: llamastack/distribution-remote-vllm
+    image: localhost/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ./run.yaml:/root/llamastack-run-remote-vllm.yaml
+      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
+    # network_mode: "host"
     ports:
-      - "5000:5000"
-    # Hack: wait for vllm server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml"
+      - "5001:5001"
+    # Hack: wait for vLLM server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
       restart_policy:
         condition: on-failure
@@ -45,4 +85,6 @@ services:
       max_attempts: 5
      window: 60s
 volumes:
-  vllm:
+  vllm-0:
+  vllm-1:
+  llamastack:
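The two vLLM containers expose OpenAI-compatible APIs on the mapped ports 5100 and 5101, so they can be sanity-checked from the host while the llamastack container sits in its 60-second startup wait. A minimal sketch, assuming the stack was brought up with `docker compose up -d`, that requests hit `localhost` from the host, and that the `openai` Python package is installed (none of which is part of this diff):

from openai import OpenAI

# Ports 5100/5101 come from the compose file above; "fake" mirrors the
# api_token placeholder used in run.yaml (vLLM accepts any key unless
# started with --api-key).
for port in (5100, 5101):
    client = OpenAI(base_url=f"http://localhost:{port}/v1", api_key="fake")
    served = [m.id for m in client.models.list().data]
    print(f"vLLM on :{port} serves: {served}")

# Quick completion against the main inference model on vllm-0.
chat = OpenAI(base_url="http://localhost:5100/v1", api_key="fake")
resp = chat.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(resp.choices[0].message.content)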

run.yaml

@ -1,35 +1,47 @@
version: '2' version: '2'
built_at: '2024-10-08T17:40:45.325529' built_at: '2024-11-11T20:09:45.988375'
image_name: local image_name: remote-vllm
docker_image: null docker_image: remote-vllm
conda_env: local conda_env: null
apis: apis:
- shields
- agents
- models
- memory
- memory_banks
- inference - inference
- memory
- safety - safety
- agents
- telemetry
providers: providers:
inference: inference:
- provider_id: vllm0 # serves main inference model
- provider_id: vllm-0
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: http://127.0.0.1:8000 # NOTE: replace with "localhost" if you are running in "host" network mode
url: http://host.docker.internal:5100/v1
max_tokens: 4096
api_token: fake
# serves safety llama_guard model
- provider_id: vllm-1
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: http://host.docker.internal:5101/v1
max_tokens: 4096
api_token: fake
memory:
- provider_id: faiss-0
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: /home/ashwin/.llama/distributions/remote-vllm/faiss_store.db
safety: safety:
- provider_id: meta0 - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard
config: config: {}
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory: memory:
- provider_id: meta0 - provider_id: meta0
provider_type: inline::meta-reference provider_type: inline::faiss
config: {} config: {}
agents: agents:
- provider_id: meta0 - provider_id: meta0
@ -38,8 +50,19 @@ providers:
persistence_store: persistence_store:
namespace: null namespace: null
type: sqlite type: sqlite
db_path: ~/.llama/runtime/kvstore.db db_path: /home/ashwin/.llama/distributions/remote-vllm/agents_store.db
telemetry: telemetry:
- provider_id: meta0 - provider_id: meta0
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: {} config: {}
metadata_store:
namespace: null
type: sqlite
db_path: /home/ashwin/.llama/distributions/remote-vllm/registry.db
models:
- model_id: Llama3.1-8B-Instruct
provider_id: vllm-0
- model_id: Llama-Guard-3-1B
provider_id: vllm-1
shields:
- shield_id: Llama-Guard-3-1B
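Once the llamastack container is up, the models and shield registered in this run.yaml should be visible through the server on port 5001 (the port the entrypoint passes via --port). A hypothetical smoke test, assuming the separate llama-stack-client Python package and that the server is reachable at localhost:5001; the exact method and parameter names are assumptions and may differ between releases:

from llama_stack_client import LlamaStackClient

# Port 5001 comes from the compose entrypoint above.
client = LlamaStackClient(base_url="http://localhost:5001")

# Expect Llama3.1-8B-Instruct and Llama-Guard-3-1B from the models section,
# plus the Llama-Guard-3-1B shield.
print([m.identifier for m in client.models.list()])
print([s.identifier for s in client.shields.list()])

response = client.inference.chat_completion(
    model_id="Llama3.1-8B-Instruct",  # parameter name assumed; may be `model` in older clients
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response)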