llama-stack-mirror/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
2025-07-25 10:41:06 -07:00

69 lines
1.5 KiB
Text

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models-safety
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 30Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server-safety
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm-safety
template:
metadata:
labels:
app.kubernetes.io/name: vllm-safety
workload-type: inference
spec:
containers:
- name: vllm-safety
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args: [
"vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.6"
]
env:
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8001
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models-safety
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server-safety
spec:
selector:
app.kubernetes.io/name: vllm-safety
ports:
- protocol: TCP
port: 8001
targetPort: 8001
type: ClusterIP