make it work on gpus

Raghotham Murthy 2025-06-24 12:20:04 -07:00
parent ee96c4891b
commit f99ca37f91
7 changed files with 30 additions and 21 deletions

@@ -6,6 +6,7 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
+  storageClassName: gp2
   resources:
     requests:
       storage: 50Gi
@@ -25,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
@@ -49,6 +42,11 @@ spec:
               key: token
         ports:
         - containerPort: 8000
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
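Net effect of the three hunks: the model-cache PVC is pinned to the gp2 EBS storage class, the pods are steered onto the EKS gpu nodegroup with a nodeSelector (replacing the pod anti-affinity spread rule), and each vLLM container requests and is capped at one nvidia.com/gpu. A minimal smoke-test pod for checking the GPU scheduling path, assuming the NVIDIA device plugin runs on the gpu nodegroup so its nodes advertise nvidia.com/gpu capacity (the pod name and CUDA image tag below are illustrative, not part of the commit):

# Hypothetical smoke test: a pod that requests one GPU and runs nvidia-smi.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test                 # illustrative name
spec:
  restartPolicy: Never
  nodeSelector:
    eks.amazonaws.com/nodegroup: gpu   # same selector as the vllm deployment
  containers:
  - name: cuda-check
    image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed tag; any CUDA base image should work
    command: ["nvidia-smi"]
    resources:
      requests:
        nvidia.com/gpu: 1
      limits:
        nvidia.com/gpu: 1

If scheduling works, kubectl logs gpu-smoke-test prints the nvidia-smi device table; a pod stuck in Pending usually means no node in the gpu nodegroup is advertising allocatable nvidia.com/gpu.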