apply pod anti-affinity and separate PVCs for the models so the two vLLM servers can be scheduled onto different nodes, avoiding unnecessary memory pressure

Ashwin Bharambe 2025-06-01 16:54:36 -07:00
parent 4121166784
commit 6f4f51f8d9
2 changed files with 38 additions and 5 deletions

View file

@@ -1,5 +1,3 @@
-# NOTE: this template does not really do any fancy node mapping or affinity declarations
-# so the inference and safety models may land on the same GPU node
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
@@ -26,13 +24,24 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: vllm
+        workload-type: inference
     spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                - inference
+            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.5"
+          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:

View file

@@ -1,3 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models-safety
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 30Gi
+---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -11,13 +24,24 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: vllm-safety
+        workload-type: inference
     spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                - inference
+            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.28"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.3"
         ]
         env:
         - name: HUGGING_FACE_HUB_TOKEN
@@ -33,7 +57,7 @@ spec:
       volumes:
       - name: llama-storage
         persistentVolumeClaim:
-          claimName: vllm-models
+          claimName: vllm-models-safety
 ---
 apiVersion: v1
 kind: Service
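
A quick way to sanity-check the result once these manifests are applied (a sketch, assuming the resources are deployed in the default namespace): the NODE column should show the inference and safety pods on different hosts, and both claims should be Bound.

    # Show each vLLM pod's node assignment together with its workload-type label
    kubectl get pods -L workload-type -o wide

    # Both PVCs should exist and be bound to separate volumes
    kubectl get pvc vllm-models vllm-models-safety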