Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-24 07:08:04 +00:00)
adhoc, test, rm storage class
# What does this PR do?

## Test Plan
parent 83c89265e0
commit 2f51129495

9 changed files with 100 additions and 32 deletions
```diff
@@ -25,16 +25,8 @@ spec:
        app.kubernetes.io/name: vllm
        workload-type: inference
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: workload-type
                operator: In
                values:
                - inference
            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
      nodeSelector:
        eks.amazonaws.com/nodegroup: gpu
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
```
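With `requiredDuringSchedulingIgnoredDuringExecution` and `topologyKey: kubernetes.io/hostname`, the scheduler never places two pods labeled `workload-type: inference` on the same node; if no eligible GPU node is free, surplus replicas stay Pending. A softer variant, sketched below as an assumption (it is not part of this diff), uses the preferred form so extra replicas can still co-schedule under pressure:

```yaml
# Hypothetical alternative to the rule above (not in this PR): prefer,
# rather than require, spreading inference pods across nodes.
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
    - weight: 100
      podAffinityTerm:
        labelSelector:
          matchExpressions:
          - key: workload-type
            operator: In
            values:
            - inference
        topologyKey: kubernetes.io/hostname
```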
```diff
@@ -42,6 +34,8 @@ spec:
        args:
        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
        env:
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
```
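`HUGGING_FACE_HUB_TOKEN` is read from a Secret under the key `token`; the Secret's `name:` line falls between the two hunks, so it is not visible here. A minimal sketch of such a Secret, with `hf-token-secret` as an assumed name:

```yaml
# Sketch only: the Secret's real name is cut off between the hunks above,
# so "hf-token-secret" is an assumption, not taken from this diff.
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
stringData:
  token: <your-hugging-face-token>
```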
```diff
@@ -49,6 +43,11 @@ spec:
              key: token
        ports:
        - containerPort: 8000
        resources:
          limits:
            nvidia.com/gpu: 1
          requests:
            nvidia.com/gpu: 1
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
```
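The commit title ("rm storage class") suggests the claim backing the `llama-storage` volume (defined elsewhere in this file or in one of the other changed files) no longer pins an explicit `storageClassName`, so the cluster's default StorageClass applies. A minimal sketch of such a PVC, with the name and requested size as assumptions:

```yaml
# Sketch of a PVC the llama-storage volume could bind to. storageClassName
# is deliberately omitted so the cluster default applies; the name and
# requested size are assumptions, not taken from this diff.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-storage
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
```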