split off safety so it can be applied one at a time

Ashwin Bharambe 2025-06-01 15:59:00 -07:00
parent d93f6c9e5b
commit 4121166784
5 changed files with 62 additions and 65 deletions

@@ -32,7 +32,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55"
+          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.5"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
@@ -48,66 +48,3 @@ spec:
       - name: llama-storage
         persistentVolumeClaim:
           claimName: vllm-models
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-server
-spec:
-  selector:
-    app.kubernetes.io/name: vllm
-  ports:
-  - protocol: TCP
-    port: 8000
-    targetPort: 8000
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-server-safety
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: vllm-safety
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: vllm-safety
-    spec:
-      containers:
-      - name: vllm-safety
-        image: vllm/vllm-openai:latest
-        command: ["/bin/sh", "-c"]
-        args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.28"
-        ]
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
-        ports:
-        - containerPort: 8001
-        volumeMounts:
-        - name: llama-storage
-          mountPath: /root/.cache/huggingface
-      volumes:
-      - name: llama-storage
-        persistentVolumeClaim:
-          claimName: vllm-models
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-server-safety
-spec:
-  selector:
-    app.kubernetes.io/name: vllm-safety
-  ports:
-  - protocol: TCP
-    port: 8001
-    targetPort: 8001
-  type: ClusterIP
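
With the safety Deployment and Service removed from this manifest, the two stacks can be applied one at a time, as the commit message says. A minimal sketch of the resulting workflow, assuming the split-off resources live in a separate file named vllm-safety.yaml and the base manifest is vllm-k8s.yaml (both filenames are hypothetical, not confirmed by this commit); note that kubectl does not expand ${...} placeholders itself, so the model variables must be substituted first, for example with envsubst:

# Apply the base inference server on its own.
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct   # hypothetical model id
envsubst < vllm-k8s.yaml | kubectl apply -f -

# Apply the safety server separately, only when it is needed.
export SAFETY_MODEL=meta-llama/Llama-Guard-3-8B           # hypothetical model id
envsubst < vllm-safety.yaml | kubectl apply -f -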