apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 50Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
        app: vllm
        workload-type: inference
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8001'
        prometheus.io/path: '/metrics'
    spec:
      # Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster.
      # If you have GPU nodes with a different label, uncomment and modify this section:
      # nodeSelector:
      #   <gpu-node-label-key>: <gpu-node-label-value>
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          command: ["/bin/sh", "-c"]
          args:
            - >-
              vllm serve ${INFERENCE_MODEL}
              --enforce-eager
              --max-model-len 8192
              --gpu-memory-utilization 0.7
              --enable-auto-tool-choice
              --tool-call-parser llama3_json
              --max-num-seqs 4
              --port 8001
          env:
            - name: INFERENCE_MODEL
              value: "${INFERENCE_MODEL}"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          ports:
            - containerPort: 8001
              name: http
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: llama-storage
              mountPath: /root/.cache/huggingface
      volumes:
        - name: llama-storage
          persistentVolumeClaim:
            claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
spec:
  selector:
    app.kubernetes.io/name: vllm
  ports:
    - protocol: TCP
      port: 8001
      targetPort: 8001
      name: http
  type: ClusterIP
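
# --- Usage notes (a sketch; the file name and model name below are assumptions) ---
# The Deployment reads the Hugging Face token from the `hf-token-secret` Secret,
# which must exist before the pod starts, e.g.:
#   kubectl create secret generic hf-token-secret --from-literal=token=<your-hf-token>
#
# Kubernetes itself does not expand ${INFERENCE_MODEL}; substitute it before
# applying the manifest, for example with envsubst:
#   export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
#   envsubst < vllm-server.yaml | kubectl apply -f -
#
# To verify the server is up, port-forward the Service and query the
# OpenAI-compatible API:
#   kubectl port-forward svc/vllm-server 8001:8001
#   curl http://localhost:8001/v1/models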