Apply anti-affinity and separate PVCs for the models so the two vLLM servers are scheduled onto two different nodes, avoiding unnecessary GPU memory pressure

Ashwin Bharambe 2025-06-01 16:54:36 -07:00
parent 4121166784
commit 6f4f51f8d9
2 changed files with 38 additions and 5 deletions


@@ -1,5 +1,3 @@
-# NOTE: this template does not really do any fancy node mapping or affinity declarations
-# so the inference and safety models may land on the same GPU node
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
@@ -26,13 +24,24 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: vllm
+        workload-type: inference
     spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: workload-type
+                    operator: In
+                    values:
+                    - inference
+              topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
       containers:
         - name: vllm
           image: vllm/vllm-openai:latest
           command: ["/bin/sh", "-c"]
           args:
-            - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.5"
+            - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
           env:
             - name: HUGGING_FACE_HUB_TOKEN
               valueFrom:


@@ -1,3 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models-safety
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 30Gi
+---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -11,13 +24,24 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: vllm-safety
+        workload-type: inference
     spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: workload-type
+                    operator: In
+                    values:
+                    - inference
+              topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
       containers:
         - name: vllm-safety
           image: vllm/vllm-openai:latest
           command: ["/bin/sh", "-c"]
           args: [
-            "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.28"
+            "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.3"
           ]
           env:
             - name: HUGGING_FACE_HUB_TOKEN
@@ -33,7 +57,7 @@ spec:
       volumes:
         - name: llama-storage
           persistentVolumeClaim:
-            claimName: vllm-models
+            claimName: vllm-models-safety
 ---
 apiVersion: v1
 kind: Service
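
A quick way to sanity-check the result after applying the updated manifests (hypothetical commands, not part of this commit; they assume a cluster with at least two schedulable GPU nodes and that the inference PVC keeps its original vllm-models name):

    kubectl get pods -l workload-type=inference -o wide   # NODE column should list two different nodes
    kubectl get pvc vllm-models vllm-models-safety        # each deployment should bind its own claim

Note that with requiredDuringSchedulingIgnoredDuringExecution, a cluster with only one eligible GPU node will leave the second vLLM pod Pending rather than co-scheduling it.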