---
# Persistent storage for the safety model's Hugging Face cache. It is mounted
# by the vllm-server-safety Deployment at /root/.cache/huggingface.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models-safety
spec:
  accessModes:
    # Read-write from a single node only; this is why the Deployment below
    # uses the Recreate strategy.
    - ReadWriteOnce
  volumeMode: Filesystem
  # NOTE(review): presumably the AWS EBS gp2 class - confirm it exists in the
  # target cluster.
  storageClassName: gp2
  resources:
    requests:
      storage: 30Gi
---
# Single-replica vLLM server for the safety model, listening on port 8001.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server-safety
spec:
  replicas: 1
  # Recreate instead of the default RollingUpdate: during a rolling update the
  # surge pod is forced onto a different node by the required anti-affinity
  # below, but the ReadWriteOnce claim can only be mounted from one node at a
  # time, so the new pod could never start and the rollout would hang.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm-safety
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-safety
        # Matched by the podAntiAffinity rule below.
        workload-type: inference
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            # Ensures no two inference pods are scheduled on the same node.
            - labelSelector:
                matchExpressions:
                  - key: workload-type
                    operator: In
                    values:
                      - inference
              topologyKey: kubernetes.io/hostname
      containers:
        - name: vllm-safety
          # NOTE(review): ":latest" is a floating tag, so restarts may pull a
          # different vLLM build - consider pinning a release tag.
          image: vllm/vllm-openai:latest
          command: ["/bin/sh", "-c"]
          # Folded scalar: the lines below are joined with single spaces into
          # one shell command string.
          # SAFETY_MODEL is not defined in this container's env, so it is
          # presumably substituted into the manifest before it is applied
          # (e.g. with envsubst) - TODO confirm; otherwise the shell expands
          # it to an empty string.
          args:
            - >-
              vllm serve ${SAFETY_MODEL}
              --dtype float16
              --enforce-eager
              --max-model-len 4096
              --port 8001
              --gpu-memory-utilization 0.3
          env:
            # Hugging Face access token, read from key "token" of the
            # hf-token-secret Secret.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          ports:
            - containerPort: 8001
          # Keep the pod out of the Service endpoints until the server answers
          # on /health. Readiness only (no liveness probe), so a slow model
          # download or load is never restarted by the kubelet.
          readinessProbe:
            httpGet:
              path: /health
              port: 8001
            periodSeconds: 10
          # NOTE(review): no resources block is declared, so no GPU is
          # requested from the scheduler - confirm how GPUs are exposed to
          # this pod (e.g. an nvidia.com/gpu limit).
          volumeMounts:
            - name: llama-storage
              mountPath: /root/.cache/huggingface
      volumes:
        - name: llama-storage
          persistentVolumeClaim:
            claimName: vllm-models-safety
---
# Cluster-internal endpoint for the safety vLLM server on port 8001.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server-safety
spec:
  selector:
    app.kubernetes.io/name: vllm-safety
  ports:
    - protocol: TCP
      port: 8001
      targetPort: 8001
  type: ClusterIP